In [49]:
import pandas as pd
import spacy
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from train_spacy import remove_url, remove_html, remove_emoji
import re


***Training pipeline***

In [50]:
PATH_DATA = './data/data_nlp/'
AD1_FILE = PATH_DATA + 'original_labels.csv'
CHUNK_SIZE = 1000

df = pd.read_csv(AD1_FILE)
df.head(3)

Unnamed: 0,entry_id,link_id,parent_id,entry_utc,subreddit,author,body,image,label_date,week,group,sheet_order,annotator_id,level_1,level_2,level_3,strength,highlight
0,exoxn7,t3_exoxn7,,1580652620,badwomensanatomy,doggodone,Do you have the skin of a 80 year old grandma?...,Yes,17-02-2020,1,1,"(1,)",3,Nonmisogynistic,None_of_the_categories,,,
1,exoxn7,t3_exoxn7,,1580652620,badwomensanatomy,doggodone,Do you have the skin of a 80 year old grandma?...,Yes,17-02-2020,1,1,"(1,)",6,Nonmisogynistic,None_of_the_categories,,,
2,exoxn7,t3_exoxn7,,1580652620,badwomensanatomy,doggodone,Do you have the skin of a 80 year old grandma?...,Yes,17-02-2020,1,1,"(1,)",4,Misogynistic,Derogation,Sexual_or_physical_limitations,Nature of the abuse is Implicit,"skin of a 80 year old grandma? Worry no more, ..."


In [51]:
df.drop(columns=['entry_id', 'link_id', 'parent_id', 'entry_utc', 'subreddit', 'author',
       'label_date', 'week', 'group', 'sheet_order',
       'annotator_id', 'level_2', 'level_3', 'strength',
       'highlight'], inplace=True)

In [52]:
one_hot = pd.get_dummies(df['level_1'])
df['misogynistic'] = one_hot.Misogynistic.values

In [53]:
df = df[~df['image'].isin(['Yes'])]
df.drop(['image', 'level_1'], axis=1, inplace=True)
df.dropna(inplace=True)
df = df[~df['body'].isin(['[removed]'])]

In [54]:
nlp = spacy.load('en_core_web_sm')
punct = string.punctuation

In [13]:
def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r"", text)

def text_data_cleaning(sentence):
    sentence = remove_url(sentence)
    sentence = remove_html(sentence)
    sentence = remove_emoji(sentence)
    doc = nlp(sentence)

    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in nlp.Defaults.stop_words and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

***TFIDF***

In [17]:
from sklearn.svm import LinearSVC

In [18]:
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning, ngram_range=(1,2))
classifier = LinearSVC()

In [19]:
X = df.body
y = df.misogynistic

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 )

In [21]:
X_train.shape, X_test.shape

((12160,), (3040,))

In [22]:
X_train[1231]

'Bernie is a commie jew....no wonder the sheep masses love him.\n\nI wonder why the system hates him so much...probably because it\'d be too transparent and people , white people, may start to wake up to our genocide if someone who is openly jewish and anti white was elected...\n\nRemember, they are "white" just like us , goyim'

In [23]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

In [24]:
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function text_data_cleaning at 0x7fe9db22c9d0>)),
                ('clf', LinearSVC())])

In [25]:
y_pred = clf.predict(X_test)

In [26]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      2709
           1       0.72      0.58      0.65       331

    accuracy                           0.93      3040
   macro avg       0.84      0.78      0.80      3040
weighted avg       0.93      0.93      0.93      3040



In [27]:
confusion_matrix(y_test, y_pred)

array([[2635,   74],
       [ 138,  193]])

In [28]:
import pickle
filename = './data/data_nlp/classifier2.sav'
pickle.dump(clf, open(filename, 'wb'))

***PREDICTIONS***

In [None]:
import pickle
clf = pickle.load(open('./data/data_nlp/classifier2.sav', 'rb'))

In [26]:
year = 2019 # available: from 2015 to 2020
PATH_DATA = './data/data_nlp/predict/'
QUOTES_FILE = PATH_DATA + f'{year}_male.json.bz2'

df = pd.read_json(QUOTES_FILE, lines=True, compression='bz2', typ='frame')


In [28]:
df['Mysoginistic'] = df.quotation.apply(lambda quote: clf.predict([quote]))

In [33]:
men_mysoginistic = df[df.Mysoginistic==1]
men_mysoginistic.shape

(56, 12)

In [35]:
for i, quote in enumerate(men_mysoginistic.quotation):
    print(f'{men_mysoginistic.iloc[i].speaker}: "{quote}"')

Jon Ronson: "Somebody HIV + should rape this bitch."
Mel Gibson: "You look like a f *** ing bitch in heat. And if you get raped by a pack of n **** s it will be your fault. Alright? Because you provoked it. You are provocatively dressed all the time with your fake boobs that you feel you have to show off. I don't like it. I don't want that woman. I don't want you. I don't trust you. I don't love you."
Stanley Ntagali: "We condemn in the strongest term possible, actions of rape and violence against women and girls. As a church, we are deeply concerned about the increasing trend of sexual and gender-based violence in the country especially towards women and girls including the continued harmful practices such as Female Genital Mutilation and child marriages. I am also concerned about the increasing cases of child molesting and defilement and its negative consequences."
Ty Dolla $ ign: "Got too many hoes, I just leave 'em on read / Too much sex, shawty, you can give me head,"
ansel elgort

In [37]:
year = 2018 # available: from 2015 to 2020
PATH_DATA = './data/data_nlp/predict/'
QUOTES_FILE = PATH_DATA + f'{year}_male.json.bz2'

df = pd.read_json(QUOTES_FILE, lines=True, compression='bz2', typ='frame')


In [38]:
df['Mysoginistic'] = df.quotation.apply(lambda quote: clf.predict([quote]))

In [39]:
men_mysoginistic = df[df.Mysoginistic==1]
men_mysoginistic.shape

(86, 12)

In [41]:
for i, quote in enumerate(men_mysoginistic.quotation):
    print(f'{men_mysoginistic.iloc[i].speaker}: "{quote}"')

Alex Hall: "Feminism causes women to hate men, kill their children, become witches, whores & lesbians."
Kanye West: "feels like [ me ] and Taylor might still have sex/I made that bitch famous,"
Snoop Dogg: "I don't give a fuck about how sexy you think the bitch look. That's a fucking doll, nigga."
Sheikh Mohammed: "The ministry is offering its courses to both sexes, not only to girls. The success of a marital relationship is the responsibility of both women and men. Limiting premarital courses to girls gives the impression that they are responsible for the high divorce rates."
Frank Partnoy: "women were called lapdogs, bitches, and whores. Male brokers groped women, demanded sex, and gave women genital-shaped food. At Merrill Lynch, women were humiliated by strippers at office parties.... At Lew Lieberbaum, there were strippers, cat calls, and demands for oral sex."
Joe Budden: "I should have sex in the coffin `cause I'm killin' these hoes."
Alex Jones: "sexual acts that can kill you."