## Basic Models (non-transformer-based)

In [1]:
!pip install demoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import gspread
import pandas as pd
from tensorflow import keras
import numpy as np
from google.colab import auth
auth.authenticate_user()
from google.auth import default
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold
import torch

creds, _ = default()
gc = gspread.authorize(creds)

In [3]:
worksheet = gc.open('USE DATASET').worksheet("Sheet1")
rows = worksheet.get_all_values()
df = pd.DataFrame.from_records(rows)
df.columns=df.iloc[0]
df = df.drop([0])
df = df[['sentence', 'final', 'before', 'after']] # keep only the sentence and class cols
df = df.dropna()
df = df[(df["final"] != "") & (df["final"] != "Undecided")]
df

Unnamed: 0,sentence,final,before,after
1,* 4 * When those Muslims came to attack Parlia...,oppression,* * When that Muslim When he came to attack 26...,* * Even when those Muslims were going to figh...
2,His sisters or sisters-in-law begin to force him.,none,So these Muslim friends start visiting the hom...,And how many such girls from Hindu families ar...
3,"” Entered, broke the Shivling into pieces, and...",culture,His scattered remains were surely nurturing th...,That property was estimated at two crore dirhams.
4,Muslims should change their instincts.,action,There is a lot of poison in the speech of Musl...,Muslims have got their desired thing in Pakist...
5,"They buy potato, onion, ginger etc.",none,There is cold storage for 👉 👉 keeping your o...,at very cheap prices directly from farmers dur...
...,...,...,...,...
6996,"More recently, you may have seen the ""Abu Bakr...",oppression,"From the ""Ganga-Jamuni Tehzeeb"", drunken Hindu...","Like the goats in Pakistan, Hindu girls would ..."
6997,* It is clear from these four examples how wro...,none,In this way he says that when the world adopts...,* The great prophet Savarkar * In 1952 Savarka...
6998,"In his view, atheist means one who does not be...",culture,The Islamic view of looking at the borders of ...,"Islam is not just a religion, in fact Islam is..."
6999,🚩 _ * * 🚩 🕉 🕉 ⚜ ⛳ ⚜ - 😬 Nurul Rahman Ba...,action,"We should not be 📲 📱 begging anyone, we know...",ूHindu cannot stop our Bangladeshi brothers fr...


In [4]:
df = df.sample(frac=1).reset_index(drop=True)
replacement = {
    'none': 0,
    'oppression': 1,
    'action': 2,
    'culture': 3
}

df['labels'] = df['final'].map(replacement)
df_keep = df[['sentence', 'labels']]
df_keep.columns = ["text", "labels"]
df_keep

Unnamed: 0,text,labels
0,Hizbul Mujahideen = Terrorist Organization 6.,0
1,"* Hindus will get India killed like Pakistan, ...",0
2,"* * We, the great Akbar's great-grandson Dara ...",0
3,Will sit and corrupt the temples?,0
4,"Pakistan is a laboratory for terrorism, not a ...",0
...,...,...
6995,The Congress has fielded Rahul Gandhi as a UDF...,0
6996,This was the aim of many Muslim invaders to co...,1
6997,Methods of his normal behavior have been displ...,0
6998,"*Like the Hindus of Kerala, no other community...",0


## Combining Sentences & Oversampling

In [5]:
combs = []
labels = df['labels']

for i in range(len(df)):
    combs.append(df.loc[i].before + " " + df.loc[i].sentence)

df_combs = pd.DataFrame(data={
    'text': combs,
    'labels': labels
})
df_combs

Unnamed: 0,text,labels
0,Hamas = Terrorist Organization 5. Hizbul Mujah...,0
1,* Will give 100% government jobs to Christians...,0
2,He was so great and generous that even the bes...,0
3,"* According to Chapter 28, Sai Baba forced a B...",0
4,Will never buy this product. Pakistan is a lab...,0
...,...,...
6995,Wayanad has been a stronghold of the Muslim Le...,0
6996,"""Gajwa-e-Hind"" means conquering India and esta...",1
6997,Islamist politics is active in many names arou...,0
6998,"* At the same time, the Muslim share is 30, 2...",0


In [6]:
df_combs = df_combs.sample(frac=1).reset_index(drop=True)
df_combs.columns = ["text", "labels"]
df_combs

Unnamed: 0,text,labels
0,* Do not try to see communalism with a single ...,1
1,"Etc., etc., but would never say to improve tha...",0
2,"In this, he has said that reservation should b...",0
3,6 - Islam will enter its bill. Muhammad himsel...,3
4,Have made superflops and made such articles an...,3
...,...,...
6995,"* Now there is nothing in #India for them, ""We...",0
6996,The entire Sindh belonged to the Sindhis seven...,1
6997,Mahatma Gandhi used to present himself as a be...,1
6998,But if you are a Kashmiri Sunni and a lot of w...,0


In [7]:
# Oversampling the dataset
c0 = df_combs[df_combs['labels'] == 0]
c1 = df_combs[df_combs['labels'] == 1]
c2 = df_combs[df_combs['labels'] == 2]
c3 = df_combs[df_combs['labels'] == 3]

c1_over = c1.sample(len(c0), replace=True)
c2_over = c2.sample(len(c0), replace=True)
c3_over = c3.sample(len(c0), replace=True)

df_oversample = pd.concat([c0, c1_over, c2_over, c3_over], axis=0)
df_oversample['labels'].value_counts()

0    3923
1    3923
2    3923
3    3923
Name: labels, dtype: int64

## Dataset Cleaning

In [8]:
import re
import nltk
import demoji
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = WordNetLemmatizer()


class Cleaner:
  def replace_emoji_and_dup(self, text):
    '''
    function takes in string `text` parameter and removes all duplicate characters in a row that are greater than 2 
    -replaces the emojis with their descriptions
    '''
    tracker = {}
    final = []

    # check if the past two characters were that same char

    for i in range(len(text)):
      if i < 2:
        final.append(text[i])
        continue
      if (text[i-1] == text[i]) and (text[i-2] == text[i]):
        pass
      else:
        final.append(text[i])

    final = "".join(final)
    return demoji.replace_with_desc(final)

  def replace_url(self, text):
    '''
    function takes a `string` parameter text and replaces all URLS with `_URL` 
    '''
    return re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", ' _URL', text)


  def final_clean(self, text):
    text = re.sub(r'\W', ' ', str(text))
    
    # remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text) 
    
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    
    # Converting to Lowercase
    text = text.lower()
    
    # Lemmatization
    text = text.split()

    text = [stemmer.lemmatize(word) for word in text]
    text = ' '.join(text)
    
    return text

  def clean_text(self, text):
    '''
    MASTER FUNCTION
    '''
    # remove emojis and duplicate chars
    clean1 = self.replace_emoji_and_dup(str(text))

    # replace links with _URL
    clean2 = self.replace_url(clean1)

    final = self.final_clean(clean2)

    return final

Cleaner = Cleaner()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
def prepreocess_text(text):
    return Cleaner.clean_text(text)

df_oversample['text'] = df_oversample['text'].apply(prepreocess_text)
df_oversample

Unnamed: 0,text,labels
1,etc etc but would never say to improve that bo...,0
2,in this he ha said that reservation should be ...,0
5,the objection and the plight of hindu coming f...,0
6,shushing face ज ajib is ironic यह for some of ...,0
7,he wa accompanied by his sister wife and nephe...,0
...,...,...
4500,and towards india it is also spreading it fun ...,3
3569,hindu muslim sikh christian sabal 16 what is t...,3
6539,the entire hindu society will have to get orga...,3
4081,there are son but to blunt their anger we star...,3


In [18]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
modelHub = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return modelHub(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [11]:
#@title Compute a representation for each message, showing various lengths supported.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [sentence]

# Reduce logging output.
logging.set_verbosity(logging.ERROR)

message_embeddings = embed(df_oversample['text'])

In [12]:
len(np.array(message_embeddings))

15692

In [13]:
df_oversample

Unnamed: 0,text,labels
1,etc etc but would never say to improve that bo...,0
2,in this he ha said that reservation should be ...,0
5,the objection and the plight of hindu coming f...,0
6,shushing face ज ajib is ironic यह for some of ...,0
7,he wa accompanied by his sister wife and nephe...,0
...,...,...
4500,and towards india it is also spreading it fun ...,3
3569,hindu muslim sikh christian sabal 16 what is t...,3
6539,the entire hindu society will have to get orga...,3
4081,there are son but to blunt their anger we star...,3


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas  as pd
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler #fixed import


In [21]:
# All the models that we are going to try
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    SVC(kernel='rbf'),#SVM
    MultinomialNB(),#Naive Bayes classifier for multinomial models
    LogisticRegression(random_state=0),
]

def get_avg(lst):
    return sum(lst) / len(lst)

for model in models:
    model_name = model.__class__.__name__
    print(f'----------------------Model: {model_name}----------------------')
    none_precisions = []
    none_recalls = []
    none_f1s = []

    action_precisions = []
    action_recalls = []
    action_f1s = []
    
    oppression_precisions = []
    oppression_recalls = []
    oppression_f1s = []

    culture_precisions = []
    culture_recalls = []
    culture_f1s = []

    var = 0
    n=10
    skf = StratifiedKFold(n_splits=n, random_state=1337, shuffle=True)
    confusions = []


    features = embed(df_oversample['text']).numpy()


    labels = df_oversample['labels']


    for i, x in enumerate(skf.split(df_oversample['text'], df_oversample['labels'])):
        df_train = df_oversample.iloc[x[0].tolist()]
        df_test = df_oversample.iloc[x[1].tolist()]

        axis=1
        features_train = features[x[0]]
        features_test = features[x[1]]

        

        X_train = features_train
        y_train = df_train['labels']
        X_test = features_test
        y_test = df_test['labels']
        

        print(f'{i}th split...')
        print(df_train.shape)
        print(df_test.shape)



        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

        none_precisions.append(report['none']['precision'])
        none_recalls.append(report['none']['recall'])
        none_f1s.append(report['none']['f1-score'])

        action_precisions.append(report['action']['precision'])
        action_recalls.append(report['action']['recall'])
        action_f1s.append(report['action']['f1-score'])

        oppression_precisions.append(report['oppression']['precision'])
        oppression_recalls.append(report['oppression']['recall'])
        oppression_f1s.append(report['oppression']['f1-score'])

        culture_precisions.append(report['culture']['precision'])
        culture_recalls.append(report['culture']['recall'])
        culture_f1s.append(report['culture']['f1-score'])
        if i == 9:
            break

    print("none_precision:", get_avg(none_precisions))
    print("none_recalls:", get_avg(none_recalls))
    print("none_f1s:", get_avg(none_f1s))

    print("action_precisions:", get_avg(action_precisions))
    print("action_recalls:", get_avg(action_recalls))
    print("action_f1s:", get_avg(action_f1s))

    print("oppression_precisions:", get_avg(oppression_precisions))
    print("oppression_recalls:", get_avg(oppression_recalls))
    print("oppression_f1s:", get_avg(oppression_f1s))
    
    print("culture_precisions:", get_avg(culture_precisions))
    print("culture_recalls:", get_avg(culture_recalls))
    print("culture_f1s:", get_avg(culture_f1s))


        

----------------------Model: RandomForestClassifier----------------------
0th split...
(14122, 2)
(1570, 2)
1th split...
(14122, 2)
(1570, 2)
2th split...
(14123, 2)
(1569, 2)
3th split...
(14123, 2)
(1569, 2)
4th split...
(14123, 2)
(1569, 2)
5th split...
(14123, 2)
(1569, 2)
6th split...
(14123, 2)
(1569, 2)
7th split...
(14123, 2)
(1569, 2)
8th split...
(14123, 2)
(1569, 2)
9th split...
(14123, 2)
(1569, 2)
none_precision: 0.5272694437714727
none_recalls: 0.39816884769174843
none_f1s: 0.4534827923705098
action_precisions: 0.571725283715552
action_recalls: 0.6171599937684997
action_f1s: 0.5932850644590781
oppression_precisions: 0.6000683341410558
oppression_recalls: 0.555708313859895
oppression_f1s: 0.5767384552676533
culture_precisions: 0.49433507879505606
culture_recalls: 0.612274108116529
culture_f1s: 0.5467537416978621
----------------------Model: SVC----------------------
0th split...
(14122, 2)
(1570, 2)
1th split...
(14122, 2)
(1570, 2)
2th split...
(14123, 2)
(1569, 2)
3th sp

ValueError: ignored

In [23]:
# All the models that we are going to try
models = [
    # RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    # SVC(kernel='rbf'),#SVM
    MultinomialNB(),#Naive Bayes classifier for multinomial models
    LogisticRegression(random_state=0),
]

def get_avg(lst):
    return sum(lst) / len(lst)

for model in models:
    model_name = model.__class__.__name__
    print(f'----------------------Model: {model_name}----------------------')
    none_precisions = []
    none_recalls = []
    none_f1s = []

    action_precisions = []
    action_recalls = []
    action_f1s = []
    
    oppression_precisions = []
    oppression_recalls = []
    oppression_f1s = []

    culture_precisions = []
    culture_recalls = []
    culture_f1s = []

    var = 0
    n=10
    skf = StratifiedKFold(n_splits=n, random_state=1337, shuffle=True)
    confusions = []


    features = embed(df_oversample['text']).numpy()
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)


    labels = df_oversample['labels']


    for i, x in enumerate(skf.split(df_oversample['text'], df_oversample['labels'])):
        df_train = df_oversample.iloc[x[0].tolist()]
        df_test = df_oversample.iloc[x[1].tolist()]

        axis=1
        features_train = features[x[0]]
        features_test = features[x[1]]

        

        X_train = features_train
        y_train = df_train['labels']
        X_test = features_test
        y_test = df_test['labels']
        

        print(f'{i}th split...')
        print(df_train.shape)
        print(df_test.shape)



        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

        none_precisions.append(report['none']['precision'])
        none_recalls.append(report['none']['recall'])
        none_f1s.append(report['none']['f1-score'])

        action_precisions.append(report['action']['precision'])
        action_recalls.append(report['action']['recall'])
        action_f1s.append(report['action']['f1-score'])

        oppression_precisions.append(report['oppression']['precision'])
        oppression_recalls.append(report['oppression']['recall'])
        oppression_f1s.append(report['oppression']['f1-score'])

        culture_precisions.append(report['culture']['precision'])
        culture_recalls.append(report['culture']['recall'])
        culture_f1s.append(report['culture']['f1-score'])
        if i == 9:
            break

    print("none_precision:", get_avg(none_precisions))
    print("none_recalls:", get_avg(none_recalls))
    print("none_f1s:", get_avg(none_f1s))

    print("action_precisions:", get_avg(action_precisions))
    print("action_recalls:", get_avg(action_recalls))
    print("action_f1s:", get_avg(action_f1s))

    print("oppression_precisions:", get_avg(oppression_precisions))
    print("oppression_recalls:", get_avg(oppression_recalls))
    print("oppression_f1s:", get_avg(oppression_f1s))
    
    print("culture_precisions:", get_avg(culture_precisions))
    print("culture_recalls:", get_avg(culture_recalls))
    print("culture_f1s:", get_avg(culture_f1s))


        

----------------------Model: MultinomialNB----------------------
0th split...
(14122, 2)
(1570, 2)
1th split...
(14122, 2)
(1570, 2)
2th split...
(14123, 2)
(1569, 2)
3th split...
(14123, 2)
(1569, 2)
4th split...
(14123, 2)
(1569, 2)
5th split...
(14123, 2)
(1569, 2)
6th split...
(14123, 2)
(1569, 2)
7th split...
(14123, 2)
(1569, 2)
8th split...
(14123, 2)
(1569, 2)
9th split...
(14123, 2)
(1569, 2)
none_precision: 0.507412941501508
none_recalls: 0.4973172093264787
none_f1s: 0.502139330394911
action_precisions: 0.5624969148462153
action_recalls: 0.45220698966609546
action_f1s: 0.5010153789562093
oppression_precisions: 0.5726632204648519
oppression_recalls: 0.5065008827958666
oppression_f1s: 0.5373053874789291
culture_precisions: 0.4610305431818973
culture_recalls: 0.613301007425871
culture_f1s: 0.5262476853774649
----------------------Model: LogisticRegression----------------------
0th split...
(14122, 2)
(1570, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


1th split...
(14122, 2)
(1570, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


3th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


4th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


5th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


6th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


7th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


8th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


9th split...
(14123, 2)
(1569, 2)
none_precision: 0.5724494104407564
none_recalls: 0.5077809368022018
none_f1s: 0.5380240593243094
action_precisions: 0.6603329723372608
action_recalls: 0.7280125408942202
action_f1s: 0.6923633753894347
oppression_precisions: 0.6504282802286683
oppression_recalls: 0.6436373786155684
oppression_f1s: 0.6466801494178893
culture_precisions: 0.6010323802714473
culture_recalls: 0.6127849613127693
culture_f1s: 0.6067248717320018


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### --- Archive ---

In [None]:
embeddings_index = {}
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()


In [None]:
embeddings_index['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [None]:
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)


KeyError: ignored

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas  as pd
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler #fixed import


In [None]:
# All the models that we are going to try
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    
    MultinomialNB(),#Naive Bayes classifier for multinomial models
    LogisticRegression(random_state=0),
    SVC(kernel='rbf'),#SVM
]

def get_avg(lst):
    return sum(lst) / len(lst)

for model in models:
    model_name = model.__class__.__name__
    print(f'----------------------Model: {model_name}----------------------')
    none_precisions = []
    none_recalls = []
    none_f1s = []

    action_precisions = []
    action_recalls = []
    action_f1s = []
    
    oppression_precisions = []
    oppression_recalls = []
    oppression_f1s = []

    culture_precisions = []
    culture_recalls = []
    culture_f1s = []

    var = 0
    n=10
    skf = StratifiedKFold(n_splits=n, random_state=1337, shuffle=True)
    confusions = []



    labels = df_oversample['labels']


    for i, x in enumerate(skf.split(df_oversample['text'], df_oversample['labels'])):
        df_train = df_oversample.iloc[x[0].tolist()]
        df_test = df_oversample.iloc[x[1].tolist()]

        axis=1
        

        X_train = df_train['text']
        y_train = df_train['labels']
        X_test = df_test['text']
        y_test = df_test['labels']
        


        X_train_vect = np.array([np.array([word2vec.wv[i] for i in ls if i in words])
                            for ls in X_train])
        X_test_vect = np.array([np.array([word2vec.wv[i] for i in ls if i in words])
                            for ls in X_test])
        
        X_train_vect_avg = []
        for v in X_train_vect:
            if v.size:
                X_train_vect_avg.append(v.mean(axis=0))
            else:
                X_train_vect_avg.append(np.zeros(100, dtype=float))
                
        X_test_vect_avg = []
        for v in X_test_vect:
            if v.size:
                X_test_vect_avg.append(v.mean(axis=0))
            else:
                X_test_vect_avg.append(np.zeros(100, dtype=float))
                

        print(f'{i}th split...')
        print(df_train.shape)
        print(df_test.shape)


        scaler = MinMaxScaler()
        X_train_vect_avg = scaler.fit_transform(X_train_vect_avg)
        X_test_vect_avg = scaler.transform(X_test_vect_avg)

        model.fit(X_train_vect_avg, y_train)
        y_pred = model.predict(X_test_vect_avg)

        report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

        none_precisions.append(report['none']['precision'])
        none_recalls.append(report['none']['recall'])
        none_f1s.append(report['none']['f1-score'])

        action_precisions.append(report['action']['precision'])
        action_recalls.append(report['action']['recall'])
        action_f1s.append(report['action']['f1-score'])

        oppression_precisions.append(report['oppression']['precision'])
        oppression_recalls.append(report['oppression']['recall'])
        oppression_f1s.append(report['oppression']['f1-score'])

        culture_precisions.append(report['culture']['precision'])
        culture_recalls.append(report['culture']['recall'])
        culture_f1s.append(report['culture']['f1-score'])
        if i == 9:
            break

    print("none_precision:", get_avg(none_precisions))
    print("none_recalls:", get_avg(none_recalls))
    print("none_f1s:", get_avg(none_f1s))

    print("action_precisions:", get_avg(action_precisions))
    print("action_recalls:", get_avg(action_recalls))
    print("action_f1s:", get_avg(action_f1s))

    print("oppression_precisions:", get_avg(oppression_precisions))
    print("oppression_recalls:", get_avg(oppression_recalls))
    print("oppression_f1s:", get_avg(oppression_f1s))
    
    print("culture_precisions:", get_avg(culture_precisions))
    print("culture_recalls:", get_avg(culture_recalls))
    print("culture_f1s:", get_avg(culture_f1s))


        

----------------------Model: RandomForestClassifier----------------------




0th split...
(14122, 2)
(1570, 2)




1th split...
(14122, 2)
(1570, 2)




2th split...
(14123, 2)
(1569, 2)




3th split...
(14123, 2)
(1569, 2)




4th split...
(14123, 2)
(1569, 2)




5th split...
(14123, 2)
(1569, 2)




6th split...
(14123, 2)
(1569, 2)




7th split...
(14123, 2)
(1569, 2)




8th split...
(14123, 2)
(1569, 2)




9th split...
(14123, 2)
(1569, 2)
none_precision: 0.375666305884105
none_recalls: 0.26714960793477693
none_f1s: 0.31198946828421986
action_precisions: 0.3629776020250902
action_recalls: 0.5980176039881602
action_f1s: 0.45139069256529635
oppression_precisions: 0.3381298459468766
oppression_recalls: 0.4266993820428935
oppression_f1s: 0.3769554691599
culture_precisions: 0.404565555757356
culture_recalls: 0.15242444305966663
culture_f1s: 0.2210025588308313
----------------------Model: MultinomialNB----------------------




0th split...
(14122, 2)
(1570, 2)




1th split...
(14122, 2)
(1570, 2)




2th split...
(14123, 2)
(1569, 2)




3th split...
(14123, 2)
(1569, 2)




4th split...
(14123, 2)
(1569, 2)




5th split...
(14123, 2)
(1569, 2)




6th split...
(14123, 2)
(1569, 2)




7th split...
(14123, 2)
(1569, 2)




8th split...
(14123, 2)
(1569, 2)




9th split...
(14123, 2)
(1569, 2)
none_precision: 0.28080976162860927
none_recalls: 0.4290056862439632
none_f1s: 0.3393596764019633
action_precisions: 0.30692990104235024
action_recalls: 0.506511917744197
action_f1s: 0.3821207990008501
oppression_precisions: 0.33744858330436367
oppression_recalls: 0.2158877291374565
oppression_f1s: 0.26282210182134425
culture_precisions: 0.35045431912879765
culture_recalls: 0.06322376278755777
culture_f1s: 0.1068855559374701
----------------------Model: LogisticRegression----------------------




0th split...
(14122, 2)
(1570, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


1th split...
(14122, 2)
(1570, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


3th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


4th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


5th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


6th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


7th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


8th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


9th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


none_precision: 0.28168089296784843
none_recalls: 0.2910837617489744
none_f1s: 0.28569607455444246
action_precisions: 0.3064842388134462
action_recalls: 0.49554447733291784
action_f1s: 0.3784948705527647
oppression_precisions: 0.3305471697989997
oppression_recalls: 0.3392720828789531
oppression_f1s: 0.3341603344310407
culture_precisions: 0.2859288848450718
culture_recalls: 0.09278574025029859
culture_f1s: 0.13966141012227187
----------------------Model: SVC----------------------




0th split...
(14122, 2)
(1570, 2)




1th split...
(14122, 2)
(1570, 2)


In [None]:
# Need to modify this code so that it will do 10 folds of when testing each model

import warnings
warnings.simplefilter("ignore")

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    SVC(kernel='poly'),#SVM
    MultinomialNB(),#Naive Bayes classifier for multinomial models
    LogisticRegression(random_state=0),
]
CV = 10
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    print("Model...", model_name)
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        print("fold", fold_idx)
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

Model... RandomForestClassifier
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model... LinearSVC
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model... MultinomialNB
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model... LogisticRegression
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9


In [None]:
# Check the accuracy
cv_df.groupby('model_name').accuracy.mean()

model_name
LinearSVC                 0.608857
LogisticRegression        0.631429
MultinomialNB             0.608000
RandomForestClassifier    0.560429
Name: accuracy, dtype: float64

In [None]:
model = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83      1245
           1       0.90      0.88      0.89      1320
           2       0.97      1.00      0.98      1308
           3       0.93      0.91      0.92      1306

    accuracy                           0.91      5179
   macro avg       0.90      0.90      0.90      5179
weighted avg       0.91      0.91      0.91      5179



In [None]:
report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

report['none']['f1-score']

ValueError: ignored

In [None]:
model = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.95      0.73      1309
           1       0.62      0.24      0.35       487
           2       0.00      0.00      0.00       141
           3       0.64      0.04      0.08       373

    accuracy                           0.60      2310
   macro avg       0.46      0.31      0.29      2310
weighted avg       0.57      0.60      0.50      2310



In [None]:
model = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.89      0.75      1309
           1       0.55      0.38      0.45       487
           2       0.17      0.01      0.01       141
           3       0.51      0.19      0.28       373

    accuracy                           0.62      2310
   macro avg       0.47      0.37      0.37      2310
weighted avg       0.57      0.62      0.56      2310



In [None]:
model = LinearSVC()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.78      0.72      1309
           1       0.49      0.49      0.49       487
           2       0.23      0.08      0.12       141
           3       0.42      0.30      0.35       373

    accuracy                           0.60      2310
   macro avg       0.45      0.41      0.42      2310
weighted avg       0.57      0.60      0.58      2310

