## Basic Models (non-transformer-based)

In [1]:
!pip install demoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.6 MB/s 
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [2]:
import gspread
import pandas as pd
from tensorflow import keras
import numpy as np
from google.colab import auth
auth.authenticate_user()
from google.auth import default
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold
import torch

creds, _ = default()
gc = gspread.authorize(creds)

In [3]:
worksheet = gc.open('USE DATASET').worksheet("Sheet1")
rows = worksheet.get_all_values()
df = pd.DataFrame.from_records(rows)
df.columns=df.iloc[0]
df = df.drop([0])
df = df[['sentence', 'final', 'before', 'after']] # keep only the sentence and class cols
df = df.dropna()
df = df[(df["final"] != "") & (df["final"] != "Undecided")]
df

Unnamed: 0,sentence,final,before,after
1,* 4 * When those Muslims came to attack Parlia...,oppression,* * When that Muslim When he came to attack 26...,* * Even when those Muslims were going to figh...
2,His sisters or sisters-in-law begin to force him.,none,So these Muslim friends start visiting the hom...,And how many such girls from Hindu families ar...
3,"” Entered, broke the Shivling into pieces, and...",culture,His scattered remains were surely nurturing th...,That property was estimated at two crore dirhams.
4,Muslims should change their instincts.,action,There is a lot of poison in the speech of Musl...,Muslims have got their desired thing in Pakist...
5,"They buy potato, onion, ginger etc.",none,There is cold storage for 👉 👉 keeping your o...,at very cheap prices directly from farmers dur...
...,...,...,...,...
6996,"More recently, you may have seen the ""Abu Bakr...",oppression,"From the ""Ganga-Jamuni Tehzeeb"", drunken Hindu...","Like the goats in Pakistan, Hindu girls would ..."
6997,* It is clear from these four examples how wro...,none,In this way he says that when the world adopts...,* The great prophet Savarkar * In 1952 Savarka...
6998,"In his view, atheist means one who does not be...",culture,The Islamic view of looking at the borders of ...,"Islam is not just a religion, in fact Islam is..."
6999,🚩 _ * * 🚩 🕉 🕉 ⚜ ⛳ ⚜ - 😬 Nurul Rahman Ba...,action,"We should not be 📲 📱 begging anyone, we know...",ूHindu cannot stop our Bangladeshi brothers fr...


In [4]:
df = df.sample(frac=1).reset_index(drop=True)
replacement = {
    'none': 0,
    'oppression': 1,
    'action': 2,
    'culture': 3
}

df['labels'] = df['final'].map(replacement)
df_keep = df[['sentence', 'labels']]
df_keep.columns = ["text", "labels"]
df_keep

Unnamed: 0,text,labels
0,"Let them do it… First, tell me what is your jo...",2
1,"The * Hindu is scared *, the * Election Commis...",0
2,🔫⚔🤺🇮🇳🦁🚩 * Identification of Macaulay's 🚩 😎 📗 👉...,2
3,2.Allauddin Khilji called Rana Ratan Singh of ...,1
4,All I want is that every core BJP supporter no...,0
...,...,...
6995,"That is, nowhere does it say that the organiza...",0
6996,Immediately Virathu was declared communal and ...,0
6997,After that prostitute gave birth to a boy and ...,0
6998,"But you will often see Hindu politicians, film...",0


## Combining Sentences & Oversampling

In [5]:
combs = []
labels = df['labels']

for i in range(len(df)):
    combs.append(df.loc[i].before + " " + df.loc[i].sentence)

df_combs = pd.DataFrame(data={
    'text': combs,
    'labels': labels
})
df_combs

Unnamed: 0,text,labels
0,"This is their work…. Let them do it… First, te...",2
1,Terrorist attacks are continuously taking plac...,0
2,🚩 🕉⚜▂▄▅▆▇█🦁█▇▆▅▄▂⚜🕉 █▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀█ 🦁 ...,2
3,Muhammad Ghori vowed the Qur'an 17 times that ...,1
4,"All I want is safety, security of Hindus, Hind...",0
...,...,...
6995,Giving special counseling to non-Muslims in Te...,0
6996,There were also secular leaders. Immediately V...,0
6997,"Today, rape of one, tomorrow of another, in th...",0
6998,You will never find a Muslim politician worshi...,0


In [6]:
df_combs = df_combs.sample(frac=1).reset_index(drop=True)
df_combs.columns = ["text", "labels"]
df_combs

Unnamed: 0,text,labels
0,नया 🏻 🏻 🏻 🏻 👆 👆 👆 👆 🏻 👆 Instagram of...,2
1,"This is 19-year-old Fully Empowered, Intellect...",1
2,Only after that did Islam begin to break up. W...,0
3,"It is in proportion to that… And listen, Modi ...",0
4,Question- The only Dalit woman…? Answer- Mayaw...,0
...,...,...
6995,The Congress and Nehru did the work of breakin...,0
6996,Incident was done in which activists of 14 Hin...,1
6997,"Even if you are running on madrasa gel things,...",0
6998,"Today in our country, there is a crowd of thos...",0


In [7]:
# Oversampling the dataset
c0 = df_combs[df_combs['labels'] == 0]
c1 = df_combs[df_combs['labels'] == 1]
c2 = df_combs[df_combs['labels'] == 2]
c3 = df_combs[df_combs['labels'] == 3]

c1_over = c1.sample(len(c0), replace=True)
c2_over = c2.sample(len(c0), replace=True)
c3_over = c3.sample(len(c0), replace=True)

df_oversample = pd.concat([c0, c1_over, c2_over, c3_over], axis=0)
df_oversample['labels'].value_counts()

0    3923
1    3923
2    3923
3    3923
Name: labels, dtype: int64

## Dataset Cleaning

In [8]:
import re
import nltk
import demoji
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = WordNetLemmatizer()


class Cleaner:
  def replace_emoji_and_dup(self, text):
    '''
    function takes in string `text` parameter and removes all duplicate characters in a row that are greater than 2 
    -replaces the emojis with their descriptions
    '''
    tracker = {}
    final = []

    # check if the past two characters were that same char

    for i in range(len(text)):
      if i < 2:
        final.append(text[i])
        continue
      if (text[i-1] == text[i]) and (text[i-2] == text[i]):
        pass
      else:
        final.append(text[i])

    final = "".join(final)
    return demoji.replace_with_desc(final)

  def replace_url(self, text):
    '''
    function takes a `string` parameter text and replaces all URLS with `_URL` 
    '''
    return re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", ' _URL', text)


  def final_clean(self, text):
    text = re.sub(r'\W', ' ', str(text))
    
    # remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text) 
    
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    
    # Converting to Lowercase
    text = text.lower()
    
    # Lemmatization
    text = text.split()

    text = [stemmer.lemmatize(word) for word in text]
    text = ' '.join(text)
    
    return text

  def clean_text(self, text):
    '''
    MASTER FUNCTION
    '''
    # remove emojis and duplicate chars
    clean1 = self.replace_emoji_and_dup(str(text))

    # replace links with _URL
    clean2 = self.replace_url(clean1)

    final = self.final_clean(clean2)

    return final

Cleaner = Cleaner()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [9]:
def prepreocess_text(text):
    return Cleaner.clean_text(text)

df_oversample['text'] = df_oversample['text'].apply(prepreocess_text)
df_oversample

Unnamed: 0,text,labels
2,only after that did islam begin to break up wh...,0
3,it is in proportion to that and listen modi an...,0
4,question the only dalit woman answer mayawati,0
6,21 the woman instigates muslim for khori and p...,0
7,ajgo hindu recognize congress conspiracy musli...,0
...,...,...
52,in fact the difference is of viewpoint it is p...,3
5431,you all will remember that if you are not forg...,3
113,thus al takaiyya ha protected the muslim from ...,3
1108,14 kalawa motor cycle muslim jawan should tie ...,3


In [10]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords

all_words = [nltk.word_tokenize(text) for text in df_oversample['text']]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, size=100,
                                   window=5,
                                   min_count=2)

In [13]:
words = set(word2vec.wv.index2word )



In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas  as pd
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler #fixed import


In [None]:
# All the models that we are going to try
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    
    MultinomialNB(),#Naive Bayes classifier for multinomial models
    LogisticRegression(random_state=0),
    SVC(kernel='rbf'),#SVM
]

def get_avg(lst):
    return sum(lst) / len(lst)

for model in models:
    model_name = model.__class__.__name__
    print(f'----------------------Model: {model_name}----------------------')
    none_precisions = []
    none_recalls = []
    none_f1s = []

    action_precisions = []
    action_recalls = []
    action_f1s = []
    
    oppression_precisions = []
    oppression_recalls = []
    oppression_f1s = []

    culture_precisions = []
    culture_recalls = []
    culture_f1s = []

    var = 0
    n=10
    skf = StratifiedKFold(n_splits=n, random_state=1337, shuffle=True)
    confusions = []



    labels = df_oversample['labels']


    for i, x in enumerate(skf.split(df_oversample['text'], df_oversample['labels'])):
        df_train = df_oversample.iloc[x[0].tolist()]
        df_test = df_oversample.iloc[x[1].tolist()]

        axis=1
        

        X_train = df_train['text']
        y_train = df_train['labels']
        X_test = df_test['text']
        y_test = df_test['labels']
        


        X_train_vect = np.array([np.array([word2vec.wv[i] for i in ls if i in words])
                            for ls in X_train])
        X_test_vect = np.array([np.array([word2vec.wv[i] for i in ls if i in words])
                            for ls in X_test])
        
        X_train_vect_avg = []
        for v in X_train_vect:
            if v.size:
                X_train_vect_avg.append(v.mean(axis=0))
            else:
                X_train_vect_avg.append(np.zeros(100, dtype=float))
                
        X_test_vect_avg = []
        for v in X_test_vect:
            if v.size:
                X_test_vect_avg.append(v.mean(axis=0))
            else:
                X_test_vect_avg.append(np.zeros(100, dtype=float))
                

        print(f'{i}th split...')
        print(df_train.shape)
        print(df_test.shape)


        scaler = MinMaxScaler()
        X_train_vect_avg = scaler.fit_transform(X_train_vect_avg)
        X_test_vect_avg = scaler.transform(X_test_vect_avg)

        model.fit(X_train_vect_avg, y_train)
        y_pred = model.predict(X_test_vect_avg)

        report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

        none_precisions.append(report['none']['precision'])
        none_recalls.append(report['none']['recall'])
        none_f1s.append(report['none']['f1-score'])

        action_precisions.append(report['action']['precision'])
        action_recalls.append(report['action']['recall'])
        action_f1s.append(report['action']['f1-score'])

        oppression_precisions.append(report['oppression']['precision'])
        oppression_recalls.append(report['oppression']['recall'])
        oppression_f1s.append(report['oppression']['f1-score'])

        culture_precisions.append(report['culture']['precision'])
        culture_recalls.append(report['culture']['recall'])
        culture_f1s.append(report['culture']['f1-score'])
        if i == 9:
            break

    print("none_precision:", get_avg(none_precisions))
    print("none_recalls:", get_avg(none_recalls))
    print("none_f1s:", get_avg(none_f1s))

    print("action_precisions:", get_avg(action_precisions))
    print("action_recalls:", get_avg(action_recalls))
    print("action_f1s:", get_avg(action_f1s))

    print("oppression_precisions:", get_avg(oppression_precisions))
    print("oppression_recalls:", get_avg(oppression_recalls))
    print("oppression_f1s:", get_avg(oppression_f1s))
    
    print("culture_precisions:", get_avg(culture_precisions))
    print("culture_recalls:", get_avg(culture_recalls))
    print("culture_f1s:", get_avg(culture_f1s))


        

----------------------Model: RandomForestClassifier----------------------




0th split...
(14122, 2)
(1570, 2)




1th split...
(14122, 2)
(1570, 2)




2th split...
(14123, 2)
(1569, 2)




3th split...
(14123, 2)
(1569, 2)




4th split...
(14123, 2)
(1569, 2)




5th split...
(14123, 2)
(1569, 2)




6th split...
(14123, 2)
(1569, 2)




7th split...
(14123, 2)
(1569, 2)




8th split...
(14123, 2)
(1569, 2)




9th split...
(14123, 2)
(1569, 2)
none_precision: 0.375666305884105
none_recalls: 0.26714960793477693
none_f1s: 0.31198946828421986
action_precisions: 0.3629776020250902
action_recalls: 0.5980176039881602
action_f1s: 0.45139069256529635
oppression_precisions: 0.3381298459468766
oppression_recalls: 0.4266993820428935
oppression_f1s: 0.3769554691599
culture_precisions: 0.404565555757356
culture_recalls: 0.15242444305966663
culture_f1s: 0.2210025588308313
----------------------Model: MultinomialNB----------------------




0th split...
(14122, 2)
(1570, 2)




1th split...
(14122, 2)
(1570, 2)




2th split...
(14123, 2)
(1569, 2)




3th split...
(14123, 2)
(1569, 2)




4th split...
(14123, 2)
(1569, 2)




5th split...
(14123, 2)
(1569, 2)




6th split...
(14123, 2)
(1569, 2)




7th split...
(14123, 2)
(1569, 2)




8th split...
(14123, 2)
(1569, 2)




9th split...
(14123, 2)
(1569, 2)
none_precision: 0.28080976162860927
none_recalls: 0.4290056862439632
none_f1s: 0.3393596764019633
action_precisions: 0.30692990104235024
action_recalls: 0.506511917744197
action_f1s: 0.3821207990008501
oppression_precisions: 0.33744858330436367
oppression_recalls: 0.2158877291374565
oppression_f1s: 0.26282210182134425
culture_precisions: 0.35045431912879765
culture_recalls: 0.06322376278755777
culture_f1s: 0.1068855559374701
----------------------Model: LogisticRegression----------------------




0th split...
(14122, 2)
(1570, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


1th split...
(14122, 2)
(1570, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


3th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


4th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


5th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


6th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


7th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


8th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


9th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


none_precision: 0.28168089296784843
none_recalls: 0.2910837617489744
none_f1s: 0.28569607455444246
action_precisions: 0.3064842388134462
action_recalls: 0.49554447733291784
action_f1s: 0.3784948705527647
oppression_precisions: 0.3305471697989997
oppression_recalls: 0.3392720828789531
oppression_f1s: 0.3341603344310407
culture_precisions: 0.2859288848450718
culture_recalls: 0.09278574025029859
culture_f1s: 0.13966141012227187
----------------------Model: SVC----------------------




0th split...
(14122, 2)
(1570, 2)




1th split...
(14122, 2)
(1570, 2)




2th split...
(14123, 2)
(1569, 2)




3th split...
(14123, 2)
(1569, 2)




4th split...
(14123, 2)
(1569, 2)




5th split...
(14123, 2)
(1569, 2)




6th split...
(14123, 2)
(1569, 2)




7th split...
(14123, 2)
(1569, 2)




8th split...
(14123, 2)
(1569, 2)




9th split...
(14123, 2)
(1569, 2)
none_precision: 0.3201578681104959
none_recalls: 0.3446389624552111
none_f1s: 0.33146589674337024
action_precisions: 0.3173105003614761
action_recalls: 0.6010898634262867
action_f1s: 0.41509865028088333
oppression_precisions: 0.36336176280693416
oppression_recalls: 0.2780800488134185
oppression_f1s: 0.31376742193066526
culture_precisions: 0.3930777030241006
culture_recalls: 0.10196162434439424
culture_f1s: 0.16169088488077504


In [15]:
# All the models that we are going to try
models = [
    LinearSVC(),#SVM
]

def get_avg(lst):
    return sum(lst) / len(lst)

for model in models:
    model_name = model.__class__.__name__
    print(f'----------------------Model: {model_name}----------------------')
    none_precisions = []
    none_recalls = []
    none_f1s = []

    action_precisions = []
    action_recalls = []
    action_f1s = []
    
    oppression_precisions = []
    oppression_recalls = []
    oppression_f1s = []

    culture_precisions = []
    culture_recalls = []
    culture_f1s = []

    var = 0
    n=10
    skf = StratifiedKFold(n_splits=n, random_state=1337, shuffle=True)
    confusions = []



    labels = df_oversample['labels']


    for i, x in enumerate(skf.split(df_oversample['text'], df_oversample['labels'])):
        df_train = df_oversample.iloc[x[0].tolist()]
        df_test = df_oversample.iloc[x[1].tolist()]

        axis=1
        

        X_train = df_train['text']
        y_train = df_train['labels']
        X_test = df_test['text']
        y_test = df_test['labels']
        


        X_train_vect = np.array([np.array([word2vec.wv[i] for i in ls if i in words])
                            for ls in X_train])
        X_test_vect = np.array([np.array([word2vec.wv[i] for i in ls if i in words])
                            for ls in X_test])
        
        X_train_vect_avg = []
        for v in X_train_vect:
            if v.size:
                X_train_vect_avg.append(v.mean(axis=0))
            else:
                X_train_vect_avg.append(np.zeros(100, dtype=float))
                
        X_test_vect_avg = []
        for v in X_test_vect:
            if v.size:
                X_test_vect_avg.append(v.mean(axis=0))
            else:
                X_test_vect_avg.append(np.zeros(100, dtype=float))
                

        print(f'{i}th split...')
        print(df_train.shape)
        print(df_test.shape)


        scaler = MinMaxScaler()
        X_train_vect_avg = scaler.fit_transform(X_train_vect_avg)
        X_test_vect_avg = scaler.transform(X_test_vect_avg)

        model.fit(X_train_vect_avg, y_train)
        y_pred = model.predict(X_test_vect_avg)

        report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

        none_precisions.append(report['none']['precision'])
        none_recalls.append(report['none']['recall'])
        none_f1s.append(report['none']['f1-score'])

        action_precisions.append(report['action']['precision'])
        action_recalls.append(report['action']['recall'])
        action_f1s.append(report['action']['f1-score'])

        oppression_precisions.append(report['oppression']['precision'])
        oppression_recalls.append(report['oppression']['recall'])
        oppression_f1s.append(report['oppression']['f1-score'])

        culture_precisions.append(report['culture']['precision'])
        culture_recalls.append(report['culture']['recall'])
        culture_f1s.append(report['culture']['f1-score'])
        if i == 9:
            break

    print("none_precision:", get_avg(none_precisions))
    print("none_recalls:", get_avg(none_recalls))
    print("none_f1s:", get_avg(none_f1s))

    print("action_precisions:", get_avg(action_precisions))
    print("action_recalls:", get_avg(action_recalls))
    print("action_f1s:", get_avg(action_f1s))

    print("oppression_precisions:", get_avg(oppression_precisions))
    print("oppression_recalls:", get_avg(oppression_recalls))
    print("oppression_f1s:", get_avg(oppression_f1s))
    
    print("culture_precisions:", get_avg(culture_precisions))
    print("culture_recalls:", get_avg(culture_recalls))
    print("culture_f1s:", get_avg(culture_f1s))


        

----------------------Model: LinearSVC----------------------




0th split...
(14122, 2)
(1570, 2)




1th split...
(14122, 2)
(1570, 2)




2th split...
(14123, 2)
(1569, 2)




3th split...
(14123, 2)
(1569, 2)




4th split...
(14123, 2)
(1569, 2)




5th split...
(14123, 2)
(1569, 2)




6th split...
(14123, 2)
(1569, 2)




7th split...
(14123, 2)
(1569, 2)




8th split...
(14123, 2)
(1569, 2)




9th split...
(14123, 2)
(1569, 2)
none_precision: 0.2902637407625556
none_recalls: 0.2510827231656021
none_f1s: 0.26912414423190767
action_precisions: 0.3200750177918241
action_recalls: 0.5103306589811496
action_f1s: 0.3933149931931035
oppression_precisions: 0.3534112144382555
oppression_recalls: 0.3869404632081841
oppression_f1s: 0.369239448276164
culture_precisions: 0.29945183306812206
culture_recalls: 0.13230253933634523
culture_f1s: 0.1832107538037831




In [None]:
# Need to modify this code so that it will do 10 folds of when testing each model

import warnings
warnings.simplefilter("ignore")

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    SVC(kernel='poly'),#SVM
    MultinomialNB(),#Naive Bayes classifier for multinomial models
    LogisticRegression(random_state=0),
]
CV = 10
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    print("Model...", model_name)
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        print("fold", fold_idx)
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

Model... RandomForestClassifier
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model... LinearSVC
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model... MultinomialNB
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model... LogisticRegression
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9


In [None]:
# Check the accuracy
cv_df.groupby('model_name').accuracy.mean()

model_name
LinearSVC                 0.608857
LogisticRegression        0.631429
MultinomialNB             0.608000
RandomForestClassifier    0.560429
Name: accuracy, dtype: float64

In [None]:
model = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83      1245
           1       0.90      0.88      0.89      1320
           2       0.97      1.00      0.98      1308
           3       0.93      0.91      0.92      1306

    accuracy                           0.91      5179
   macro avg       0.90      0.90      0.90      5179
weighted avg       0.91      0.91      0.91      5179



In [None]:
report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

report['none']['f1-score']

ValueError: ignored

In [None]:
model = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.95      0.73      1309
           1       0.62      0.24      0.35       487
           2       0.00      0.00      0.00       141
           3       0.64      0.04      0.08       373

    accuracy                           0.60      2310
   macro avg       0.46      0.31      0.29      2310
weighted avg       0.57      0.60      0.50      2310



In [None]:
model = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.89      0.75      1309
           1       0.55      0.38      0.45       487
           2       0.17      0.01      0.01       141
           3       0.51      0.19      0.28       373

    accuracy                           0.62      2310
   macro avg       0.47      0.37      0.37      2310
weighted avg       0.57      0.62      0.56      2310



In [None]:
model = LinearSVC()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.78      0.72      1309
           1       0.49      0.49      0.49       487
           2       0.23      0.08      0.12       141
           3       0.42      0.30      0.35       373

    accuracy                           0.60      2310
   macro avg       0.45      0.41      0.42      2310
weighted avg       0.57      0.60      0.58      2310

