In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report

In [None]:
# Install/upgrade `transformers` and install `simpletransformers` into the runtime.
# NOTE(review): neither package is version-pinned, so a re-run may resolve to
# different releases than the ones that produced the saved outputs.
!pip install --upgrade transformers
!pip install simpletransformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 8.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 36.8MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 54.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=74f7f34f4959f

In [None]:
# Import the libraries used for fine-tuning and plotting.
# The imports are wrapped in try/except so that a missing or broken package is
# reported with a readable message.
# NOTE(review): the exception is only printed, not re-raised, so later cells will
# still fail (with NameError) if any of these imports did not succeed.

try:
  from sklearn.metrics import f1_score
  from sklearn.model_selection import train_test_split
  from simpletransformers.classification import ClassificationModel, ClassificationArgs
  import pandas as pd
  import numpy as np
  import seaborn as sns
  sns.set_theme(style="darkgrid")

except Exception as e:
  print('Package Error! \n\n {}'.format(e))

In [None]:
# Mount Google Drive at /content/drive; the CSV files read below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# EDA
# Load the training AND test sets up front. The cleaning cells below operate on
# both frames, so `test` must exist before they run on a fresh kernel.
train = pd.read_csv('/content/drive/MyDrive/sarcasem/training_data.csv')
test = pd.read_csv('/content/drive/MyDrive/sarcasem/test_set.csv')

train.head()

Unnamed: 0,tweet,sarcasm,sentiment,dialect
0,"""د. #محمود_العلايلي:أرى أن الفريق #أحمد_شفيق ر...",False,NEU,msa
1,"""مع فيدرر يا آجا والكبار 😍 https://t.co/hrBeHb...",False,NEU,msa
2,“الداعون لمبدأ الاختلاط بين الجنسين؛ كالداعين ...,True,NEG,msa
3,"""@ihe_94 @ya78m @amooo5 @badiajnikhar @Oukasaf...",True,NEG,gulf
4,"""قل شرق حلب ولا تقل حلب الشرقية ....وقل غرب حل...",False,NEU,msa


In [None]:
def remove_urls(text):
    """Return `text` with every `http://`, `https://` or `www.` link replaced by one space."""
    return re.sub(r'https?://\S+|www\.\S+', ' ', text)
# Strip links from every training tweet.
train['tweet'] = train['tweet'].apply(remove_urls)

In [None]:
# Strip links from every test tweet.
test['tweet'] = test['tweet'].apply(remove_urls)

In [None]:
def remove_html(text):
    """Return `text` with each tag-like span (shortest `<...>` match) replaced by one space."""
    return re.sub('<.*?>', ' ', text)
# Strip HTML-like tags from every training tweet.
train["tweet"] = train["tweet"].apply(remove_html)

In [None]:
# Strip HTML-like tags from every test tweet.
test["tweet"] = test["tweet"].apply(remove_html)

In [None]:
# Download the NLTK stop-word corpus so that `stopwords.words(...)` can be called below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

'''
The first step is to subject the data to preprocessing.
This involves removing both arabic and english punctuation
Normalizing different letter variants with one common letter
'''
# Arabic and English punctuation characters that preprocess() removes from the text.
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

# Arabic stop words from NLTK. Calling stopwords.words() with no argument returns
# the stop words of every language in the corpus, so Arabic is requested
# explicitly. Stored as a set so each membership test is O(1) instead of a scan
# over the whole list.
stop_words = set(stopwords.words('arabic'))

# Arabic diacritics (tashkeel) plus the tatweel elongation character, written as
# code-point escapes so the pattern does not depend on invisible combining marks
# surviving copy/paste. In VERBOSE mode the whitespace and `#` comments are ignored.
arabic_diacritics = re.compile(r"""
    \u0651 |  # Shadda
    \u064E |  # Fatha
    \u064B |  # Tanwin Fath
    \u064F |  # Damma
    \u064C |  # Tanwin Damm
    \u0650 |  # Kasra
    \u064D |  # Tanwin Kasr
    \u0652 |  # Sukun
    \u0640    # Tatwil/Kashida
""", re.VERBOSE)

def preprocess(text):
    '''
    Clean and normalise one Arabic tweet.

    Steps, in order: delete punctuation, delete diacritics, unify letter
    variants, drop stop words (which also collapses runs of whitespace).

    text is an arabic string input

    the preprocessed text is returned
    '''

    # Delete every character listed in `punctuations`.
    translator = str.maketrans('', '', punctuations)
    text = text.translate(translator)

    # Remove tashkeel. Diacritics sit inside words, so they are deleted outright;
    # substituting a space would split a vocalised word into separate fragments.
    text = re.sub(arabic_diacritics, '', text)

    # Normalise letter variants to one common form.
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)

    # Drop stop words; split()/join also squeezes whitespace to single spaces.
    # NOTE(review): the text is normalised before this filter but `stop_words`
    # is not, so stop words spelled with un-normalised letters will not match.
    text = ' '.join(word for word in text.split() if word not in stop_words)

    return text
  
# Normalise every training tweet, then preview the first five rows.
train['tweet'] = train['tweet'].apply(preprocess)
print(train.head())


                                               tweet  sarcasm sentiment dialect
0  د محمودالعلايلياري ان الفريق احمدشفيق رقم مهم ...    False       NEU     msa
1                                فيدرر اجا والكبار 😍    False       NEU     msa
2  الداعون لمبدا الاختلاط الجنسين كالداعين لالغاء...     True       NEG     msa
3  ihe94 ya78m amooo5 badiajnikhar Oukasafa reosh...     True       NEG    gulf
4  قل شرق حلب تقل حلب الشرقيه وقل غرب حلب تقل حلب...    False       NEU     msa


In [None]:
# Apply the same normalisation routine to the test tweets.
test['tweet'] = test['tweet'].apply(preprocess)


In [None]:
# Display the training frame after preprocessing.
train

Unnamed: 0,tweet,sarcasm,sentiment,dialect
0,د محمودالعلايلياري ان الفريق احمدشفيق رقم مهم ...,False,NEU,msa
1,فيدرر اجا والكبار 😍,False,NEU,msa
2,الداعون لمبدا الاختلاط الجنسين كالداعين لالغاء...,True,NEG,msa
3,ihe94 ya78m amooo5 badiajnikhar Oukasafa reosh...,True,NEG,gulf
4,قل شرق حلب تقل حلب الشرقيه وقل غرب حلب تقل حلب...,False,NEU,msa
...,...,...,...,...
12543,صاير انت فلعوط بدك تعطي محاضرات ع تويتر بالكهر...,True,NEG,levant
12544,اله الا الله💜ايفونالبروفيسور,False,NEU,egypt
12545,RT turkyepost اردوغان اذا كان المرتكب مسلم يسم...,False,NEU,msa
12546,RT Yousiif65 هاري بوتر👓🎩,False,POS,egypt


In [None]:
def remove_emoji(string):
    """Return `string` with every run of characters in the listed Unicode ranges removed.

    NOTE(review): the last range runs from U+24C2 to U+1F251, which is far wider
    than emoji alone and includes many non-emoji code points.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub("", string)
# Strip emoji from every training tweet.
train["tweet"] = train["tweet"].apply(remove_emoji)


In [None]:
# Strip emoji from every test tweet.
test["tweet"] = test["tweet"].apply(remove_emoji)


In [None]:
# Display the test frame after the cleaning steps above.
test

Unnamed: 0,tweet,dialect
0,اخوي حانق يالغلا وشفيك معصب عادي تراهم بشر يفط...,msa
1,اف مو متعوده عليهم سته,msa
2,اللهم اشف مرضانا ومرضي المسلمين,msa
3,ابشركم طلقت السات,gulf
4,مءشر خطير ٩٠٪ الشخصيات البرلمانيه الكويت تعرض ...,msa
...,...,...
2995,تربكني الذكري ليا مر طاريهوانسي البشر حولي وال...,msa
2996,وانا احسبهم الحين مايتركون حركاتهم,msa
2997,فههههههههههدغشششششششاااااااامالببببببصصصصصمانا...,msa
2998,كان الامر بيدي لاخفيت انهيار دموعي سحقا لتلك ا...,msa


In [None]:
# Keep only the model input (tweet text) and the target (sarcasm flag).
# Columns are selected by name rather than by position, so a changed column
# order fails loudly instead of silently picking the wrong fields. `.copy()`
# gives an independent frame that is safe to add columns to afterwards.
train = train[['tweet', 'sarcasm']].copy()
train.head()

Unnamed: 0,tweet,sarcasm
0,د محمودالعلايلياري ان الفريق احمدشفيق رقم مهم ...,False
1,فيدرر اجا والكبار 😍,False
2,الداعون لمبدا الاختلاط الجنسين كالداعين لالغاء...,True
3,ihe94 ya78m amooo5 badiajnikhar Oukasafa reosh...,True
4,قل شرق حلب تقل حلب الشرقيه وقل غرب حلب تقل حلب...,False


In [None]:
# Sequential row numbers 0 .. len(train)-1.
# The list keeps the name `id` (shadowing the builtin) because the cell that
# rebuilds the frame reads it under that name.
id = list(range(len(train)))
train['id'] = id

In [None]:
# Rebuild the frame with three columns: id, text (the tweet) and label (the sarcasm flag).
train = pd.DataFrame(
    {
        'id': id,
        'text': train['tweet'],
        'label': train['sarcasm'],
    }
)


In [None]:
# Reload the test set and run it through the same cleaning steps, in the same
# order, as the training tweets. Re-reading the CSV alone would discard any
# earlier cleaning of `test`, and the model would then be asked to predict on
# raw text after being trained on cleaned text.
test = pd.read_csv('/content/drive/MyDrive/sarcasem/test_set.csv')
for clean_step in (remove_urls, remove_html, preprocess, remove_emoji):
    test['tweet'] = test['tweet'].apply(clean_step)


In [None]:
# Rename the three columns positionally to id / text / labels
# (the former 'label' column becomes 'labels').
# NOTE(review): presumably these are the column names the classifier expects — confirm.
train.columns = ['id','text', 'labels']
print(train.columns)

Index(['id', 'text', 'labels'], dtype='object')


In [None]:
# Hold out 5% of the labelled rows as valid_df; random_state fixes the shuffle
# so the split is repeatable.
# NOTE(review): valid_df is not referenced again in the cells shown here.
train_df, valid_df = train_test_split(train, test_size=0.05, random_state=10)

In [None]:
# Training hyper-parameters for the fine-tuning run.
model_args = ClassificationArgs(
    num_train_epochs=5,
    train_batch_size=8,
    learning_rate=1e-5,
    overwrite_output_dir=True,
    manual_seed=17,
    silent=True,
)

# Two-class BERT classifier initialised from the AraBERT v02 large checkpoint.
model = ClassificationModel(
    model_type='bert',
    model_name='aubmindlab/bert-large-arabertv02',
    use_cuda=True,
    num_labels=2,
    args=model_args,
)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-large-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

Downloading:   0%|          | 0.00/825k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/382 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

In [None]:
# Fine-tune the classifier on the training split (train_df) only.
model.train_model(train_df)



(7450, 0.47319778829037945)

In [None]:
# Predict on the test tweets. The tweets are passed as a plain Python list of
# strings rather than a pandas Series, so the call does not depend on how the
# library indexes or type-checks its input.
test_predictions, raw_outputs = model.predict(test['tweet'].tolist())

In [None]:
# Wrap the predictions in a one-column frame and save them to test.csv (no index column).
sarcasm=test_predictions
submit = pd.DataFrame({'sarcasm':sarcasm})
submit.to_csv('test.csv', index=False)

In [None]:
def digit_convertor(x):
    """Map a prediction to its label string: 'FALSE' when `x` equals 0, otherwise 'TRUE'."""
    return 'FALSE' if x == 0 else 'TRUE'

In [None]:
 # Derive the TRUE/FALSE labels from the model predictions, not from the column
 # being overwritten. This keeps the cell idempotent: converting the already
 # converted strings a second time would turn every row into 'TRUE'.
 submit['sarcasm'] = [digit_convertor(label) for label in test_predictions]

In [None]:
# Save the final submission frame as CSV without the index column.
submit.to_csv('dalya_Subtask_1.csv', index=False)