In [1]:
import pandas as pd

df = pd.read_csv('../input/arabic-dialects/aradic_dialects.csv', lineterminator='\n')

df.tail()

Unnamed: 0,id,dialect,text
458192,1019484980282580992,BH,@Al_mhbaa_7 مبسوطين منك اللي باسطانا😅
458193,1021083283709407232,BH,@Zzainabali @P_ameerah والله ماينده ابش يختي
458194,1017477537889431552,BH,@Al_mhbaa_7 شو عملنا لك حنا تهربي مننا احنا مس...
458195,1022430374696239232,BH,@haneenalmwla الله يبارك فيها وبالعافيه 😋😋😋
458196,1022409931029458944,BH,@jolnar121 السحله ضيفي ي بتطلع لك سحليه😅😅


In [2]:
import emoji
from nltk.corpus import stopwords

def avg_word(sentence):
    words = sentence.split()
    if len(words) == 0:
        return 0
    return sum(len(word) for word in words)/len(words)

def emoji_counter(sentence):
    return emoji.emoji_count(sentence)

df['word_count'] = df['text'].apply(lambda x: len(str(x).split(" ")))
df['char_count'] = df['text'].str.len()
df['avg_char_per_word'] = df['text'].apply(avg_word)
stop = stopwords.words('arabic')
df['stopwords'] = df['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
df['emoji_count'] = df['text'].apply(emoji_counter)
df.tail()

Unnamed: 0,id,dialect,text,word_count,char_count,avg_char_per_word,stopwords,emoji_count
458192,1019484980282580992,BH,@Al_mhbaa_7 مبسوطين منك اللي باسطانا😅,5,37,6.6,0,1
458193,1021083283709407232,BH,@Zzainabali @P_ameerah والله ماينده ابش يختي,6,44,6.5,0,0
458194,1017477537889431552,BH,@Al_mhbaa_7 شو عملنا لك حنا تهربي مننا احنا مس...,14,73,4.285714,1,2
458195,1022430374696239232,BH,@haneenalmwla الله يبارك فيها وبالعافيه 😋😋😋,6,43,6.333333,1,3
458196,1022409931029458944,BH,@jolnar121 السحله ضيفي ي بتطلع لك سحليه😅😅,7,41,5.0,2,2


In [3]:
import pyarabic.araby as araby
import re
def normalizeArabic(text):
    text = text.strip()
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(noise, '', text)
    text = re.sub(r'(.)\1+', r"\1\1", text) # Remove longation
    return araby.strip_tashkeel(text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    return text

def clean_tweet(text):
    text = re.sub('http\S+\s*', ' ', text)  # remove URLs
    text = re.sub('RT|cc', ' ', text)  # remove RT and cc
    text = re.sub('@[^\s]+',' ',text)
    text = re.sub('#[^\s]+',' ',text)
    return text

def clean_text(text):
    ## Clean for tweets
    text = clean_tweet(text)
    ## Remove Emojis
    text = remove_emoji(text)
    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)  # remove punctuation
    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)  
    ## Remove numbers
    text = re.sub("\d+", " ", text)
    ## Remove Tashkeel
    text = normalizeArabic(text)
    #text = re.sub('\W+', ' ', text)
    text = re.sub('[A-Za-z]+',' ',text)
    # strip
    text = text.strip()
    return text if text else None

df['clean_text'] = df['text'].apply(clean_text)

df.tail()

Unnamed: 0,id,dialect,text,word_count,char_count,avg_char_per_word,stopwords,emoji_count,clean_text
458192,1019484980282580992,BH,@Al_mhbaa_7 مبسوطين منك اللي باسطانا😅,5,37,6.6,0,1,مبسوطين منك اللي باسطانا
458193,1021083283709407232,BH,@Zzainabali @P_ameerah والله ماينده ابش يختي,6,44,6.5,0,0,والله ماينده ابش يختي
458194,1017477537889431552,BH,@Al_mhbaa_7 شو عملنا لك حنا تهربي مننا احنا مس...,14,73,4.285714,1,2,شو عملنا لك حنا تهربي مننا احنا مساكين ليش بتع...
458195,1022430374696239232,BH,@haneenalmwla الله يبارك فيها وبالعافيه 😋😋😋,6,43,6.333333,1,3,الله يبارك فيها وبالعافيه
458196,1022409931029458944,BH,@jolnar121 السحله ضيفي ي بتطلع لك سحليه😅😅,7,41,5.0,2,2,السحله ضيفي ي بتطلع لك سحليه


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458197 entries, 0 to 458196
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 458197 non-null  int64  
 1   dialect            458197 non-null  object 
 2   text               458197 non-null  object 
 3   word_count         458197 non-null  int64  
 4   char_count         458197 non-null  int64  
 5   avg_char_per_word  458197 non-null  float64
 6   stopwords          458197 non-null  int64  
 7   emoji_count        458197 non-null  int64  
 8   clean_text         457562 non-null  object 
dtypes: float64(1), int64(5), object(3)
memory usage: 31.5+ MB


In [5]:
df[df.isnull().any(axis=1)]

Unnamed: 0,id,dialect,text,word_count,char_count,avg_char_per_word,stopwords,emoji_count,clean_text
1317,880691917603917824,IQ,#يحرر_الابطال_ما_باعه_الارذال https://t.co/knc...,2,53,26.000000,0,0,
2425,731440698625564672,IQ,#شطر_تهديه_للي_فبالك https://t.co/gK1VbXnUvv,2,44,21.500000,0,0,
2907,937865370555375616,IQ,#مستمرون_للدعم\n#مستمرون_بالخشم https://t.co/v...,2,54,17.333333,0,0,
4030,962413041638891648,IQ,#اسقاط_طايره_اسراييليه #سوريا https://t.co/c0R...,3,53,17.000000,0,0,
4457,471056063672750080,IQ,ﻱ ﺷﻮﻕ ﻗﻠﮧ ﺻﺎﺍﺣﺒﮏ ﺩﻭﻡ ﻳﻄﺮﻳﮏ ﻭﺎﻧﮏ ﺏ ﻗﻠﺒﮧ ﻟﻮ ﺗﻄﻮﻝ...,14,61,3.428571,0,0,
...,...,...,...,...,...,...,...,...,...
449878,694521412095115264,BH,#جامع_السلطان_أحمد_اسطنبول https://t.co/570cdl...,2,50,24.500000,0,0,
452674,1175057498979676160,BH,#تغيير_مكتب_تويتر_بدبي,1,22,22.000000,0,0,
454965,673759660936155136,BH,#الجو_مو_مال_دوام.,1,18,18.000000,0,0,
456554,1027820957895938048,BH,#معصيتي_راحتي #سكس_كتابي #سكس #سحاق #دياثة #جن...,8,76,8.625000,0,0,


In [6]:
df.dropna(axis=0, inplace=True)

In [7]:
df.to_csv('arabic_dialects_clean.csv', index=False)