In [1]:
# Imports PIL module 
import pandas as pd
import spacy
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
PATH_DATA = './data/data_nlp/'
AD1_FILE = PATH_DATA + 'MeTooHate.csv'
CHUNK_SIZE = 1000

df = pd.read_csv(AD1_FILE)

In [3]:
df.drop(columns=['status_id', 'created_at', 'favorite_count', 'retweet_count',
       'location', 'followers_count', 'friends_count', 'statuses_count',
       ], inplace=True)

df.dropna(inplace=True)
df.shape


(803638, 2)

In [4]:
df.isna().sum()

text        0
category    0
dtype: int64

In [5]:
nlp = spacy.load('en_core_web_sm')
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
import re

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r"", text)

def text_data_cleaning(sentence):
    sentence = remove_url(sentence)
    sentence = remove_html(sentence)
    sentence = remove_emoji(sentence)
    doc = nlp(sentence)

    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in nlp.Defaults.stop_words and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [7]:
text_data_cleaning(df.text[3])

['yep',
 'like',
 'triffele',
 'woman',
 'weaponize',
 'poon',
 'wonder',
 'kamala',
 'harris',
 'extort',
 'willy',
 'brown',
 'throw',
 'poon',
 'oh',
 'yeh',
 'job',
 'joke']

In [8]:
df.text[3]

'Yep just like triffeling women weaponized their poon!! Wonder if Kamala Harris ever extorted Willy Brown after throwing the poon on him, oh yeh, that how she got her first job me too is a JOKE! '

In [9]:
remove_emoji(df.text[9])

" Isn't it nice that you know everything?       \n      "

***TFIDF***

In [10]:
from sklearn.svm import LinearSVC

In [37]:
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifier = LinearSVC(verbose=True)

In [38]:
X = df.text
y = df.category

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 )

In [40]:
X_train.shape, X_test.shape

((642910,), (160728,))

In [41]:
X_train[524689]

"so... abusing women??? I guess karma is a bitch, huh!!! Remember your mantra...!!  So, those rules should serve you well, right??!! Good luck Creepy porn lawyer.. couldn't have happened to a nicer guy!! JERK"

In [42]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)], verbose=True)

In [43]:
clf.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 2) Processing tfidf, total=111.5min
[LibLinear].....*
optimization finished, #iter = 55
Objective value = -106844.563668
nSV = 298712
[Pipeline] ............... (step 2 of 2) Processing clf, total=  14.5s


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7f8dbb262280>)),
                ('clf', LinearSVC(verbose=True))],
         verbose=True)

In [44]:
y_pred = clf.predict(X_test)

In [45]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97    141532
           1       0.88      0.59      0.71     19196

    accuracy                           0.94    160728
   macro avg       0.91      0.79      0.84    160728
weighted avg       0.94      0.94      0.94    160728



In [46]:
confusion_matrix(y_test, y_pred)

array([[140001,   1531],
       [  7799,  11397]])

In [76]:
[X_train[524689]]
clf.predict(['love women'])

array([0])

In [71]:
import pickle
filename = './data/data_nlp/classifier.sav'
pickle.dump(clf, open(filename, 'wb'))

In [70]:
df

Unnamed: 0,text,category
0,"Entitled, obnoxious, defensive, lying weasel. ...",0
1,Thank you and for what you did for the women...,0
2,Knitting (s) &amp; getting ready for January 1...,0
3,Yep just like triffeling women weaponized thei...,1
4,"No, the President wants to end movement posin...",0
...,...,...
807169,Let’s not forget that this “iconic kiss” was u...,0
807170,DEFINITELY....the only one any of us should su...,0
807171,Did the movement count the dollars of Erin An...,0
807172,This is one of my all time fav songs &amp; vid...,1


***PREDICTIONS***

In [79]:
year = 2016 # available: from 2015 to 2020
PATH_DATA = './data/data_nlp/'
QUOTES_FILE = PATH_DATA + f'quotes-{year}-filtered.json.bz2'
CHUNK_SIZE = 500

reader = pd.read_json(QUOTES_FILE, lines=True, compression='bz2', chunksize=CHUNK_SIZE, typ='frame')

chunks = [] #utile pour plus loins quand on fait le feature extraction
i=0
for chunk in reader:
    df_0 = chunk
    break
df_0.quotation[0]

"[ Malia ] knows what she is going to do. They have a plan for her and her family feels comfortable knowing that it's not something unstructured,"

In [137]:
year = 2020 # available: from 2015 to 2020
PATH_DATA = './data/data_nlp/'
QUOTES_FILE = PATH_DATA + 'metoo_2019_04.json.bz2'

df = pd.read_json(QUOTES_FILE, lines=True, compression='bz2', typ='frame')


In [141]:
df_new = df.iloc[:100]

In [142]:
df_new['Hatred'] = df_new.quotation.apply(lambda quote: clf.predict([quote]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Hatred'] = df_new.quotation.apply(lambda quote: clf.predict([quote]))


In [158]:
s = df_new[df_new.Hatred == 1].quotation.iloc[3]

In [159]:
def split(text, n=100):
    return []

'Thankfully, with places like Optimism Place, we have supports put in place for people who are involved, or who are victims or survivors of domestic violence and we have the ability to help those people through those problems -- not only with the survivors, but with those families as well in helping to prevent any further problems from happening.'

In [164]:
s_s = [s[100*i:100*(i+1)] for i in range(len(s)//100)] + [s[100*(len(s)//100):]];s_s

['Thankfully, with places like Optimism Place, we have supports put in place for people who are involv',
 'ed, or who are victims or survivors of domestic violence and we have the ability to help those peopl',
 'e through those problems -- not only with the survivors, but with those families as well in helping ',
 'to prevent any further problems from happening.']

In [167]:
for s in s_s:
    clf.predict([s])


In [168]:
clf.predict(s_s)

array([0, 1, 1, 0])