## Basic Models (non-transformer-based)

In [1]:
!pip install demoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import gspread
import pandas as pd
from tensorflow import keras
import numpy as np
from google.colab import auth
auth.authenticate_user()
from google.auth import default
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold
import torch

creds, _ = default()
gc = gspread.authorize(creds)

In [3]:
worksheet = gc.open('USE DATASET').worksheet("Sheet1")
rows = worksheet.get_all_values()
df = pd.DataFrame.from_records(rows)
df.columns=df.iloc[0]
df = df.drop([0])
df = df[['sentence', 'final', 'before', 'after']] # keep only the sentence and class cols
df = df.dropna()
df = df[(df["final"] != "") & (df["final"] != "Undecided")]
df

Unnamed: 0,sentence,final,before,after
1,* 4 * When those Muslims came to attack Parlia...,oppression,* * When that Muslim When he came to attack 26...,* * Even when those Muslims were going to figh...
2,His sisters or sisters-in-law begin to force him.,none,So these Muslim friends start visiting the hom...,And how many such girls from Hindu families ar...
3,"” Entered, broke the Shivling into pieces, and...",culture,His scattered remains were surely nurturing th...,That property was estimated at two crore dirhams.
4,Muslims should change their instincts.,action,There is a lot of poison in the speech of Musl...,Muslims have got their desired thing in Pakist...
5,"They buy potato, onion, ginger etc.",none,There is cold storage for 👉 👉 keeping your o...,at very cheap prices directly from farmers dur...
...,...,...,...,...
6996,"More recently, you may have seen the ""Abu Bakr...",oppression,"From the ""Ganga-Jamuni Tehzeeb"", drunken Hindu...","Like the goats in Pakistan, Hindu girls would ..."
6997,* It is clear from these four examples how wro...,none,In this way he says that when the world adopts...,* The great prophet Savarkar * In 1952 Savarka...
6998,"In his view, atheist means one who does not be...",culture,The Islamic view of looking at the borders of ...,"Islam is not just a religion, in fact Islam is..."
6999,🚩 _ * * 🚩 🕉 🕉 ⚜ ⛳ ⚜ - 😬 Nurul Rahman Ba...,action,"We should not be 📲 📱 begging anyone, we know...",ूHindu cannot stop our Bangladeshi brothers fr...


In [4]:
df = df.sample(frac=1).reset_index(drop=True)
replacement = {
    'none': 0,
    'oppression': 1,
    'action': 2,
    'culture': 3
}

df['labels'] = df['final'].map(replacement)
df_keep = df[['sentence', 'labels']]
df_keep.columns = ["text", "labels"]
df_keep

Unnamed: 0,text,labels
0,Hari Singh Nalwa ji was the only warrior (not ...,0
1,The population of Jammu city has been around 7...,0
2,.. Namaz as per rules!,0
3,"And during this time, they looted people and b...",1
4,The Indonesia State Intelligence Agency said o...,0
...,...,...
6995,🚩 - Dr. Vivek Arya जनसंख्या On one side the p...,1
6996,🚩 _*Muslim Sterilisation ( Nasbandi) Compulsa...,0
6997,They united only for the goods they got in jihad.,3
6998,All his wishes which are being fulfilled.,0


## Combining Sentences & Oversampling

In [5]:
combs = []
labels = df['labels']

for i in range(len(df)):
    combs.append(df.loc[i].before + " " + df.loc[i].sentence)

df_combs = pd.DataFrame(data={
    'text': combs,
    'labels': labels
})
df_combs

Unnamed: 0,text,labels
0,"After this, all other Muslim men were also iss...",0
1,"4) According to the 2011 census, the populatio...",0
2,.. Office Hours Finished! .. Namaz as per rules!,0
3,निकालाAllah 🔹 pulled out the bankruptcy of sc...,1
4,They were the most horrific terrorist attacks ...,0
...,...,...
6995,"🚩 Therefore, population control laws in the c...",1
6996,http://www.quran.com/9/37. 🚩 _*Muslim Sterilis...,0
6997,"As their numbers grew, they also increased the...",3
6998,And what did the princess who wanted to do in ...,0


In [6]:
df_combs = df_combs.sample(frac=1).reset_index(drop=True)
df_combs.columns = ["text", "labels"]
df_combs

Unnamed: 0,text,labels
0,बचें 🔱 🚩 Avoid eating or consuming anything ...,1
1,In such a country 710 blocks and 196 districts...,0
2,The ISIS militants who attacked the café 🏽 👉 ...,0
3,You enter a restaurant in Bangladesh and ask p...,0
4,The office started with Ajan !! .. We read Aja...,0
...,...,...
6995,"Therefore, even today, despite being the large...",3
6996,"Kashmiri Pandit Sanjay Bahadur, who is living ...",0
6997,* * If Pakistan has to be cancer free then the...,2
6998,"Then, does the judge who decides the Supreme C...",0


In [7]:
# Oversampling the dataset
c0 = df_combs[df_combs['labels'] == 0]
c1 = df_combs[df_combs['labels'] == 1]
c2 = df_combs[df_combs['labels'] == 2]
c3 = df_combs[df_combs['labels'] == 3]

c1_over = c1.sample(len(c0), replace=True)
c2_over = c2.sample(len(c0), replace=True)
c3_over = c3.sample(len(c0), replace=True)

df_oversample = pd.concat([c0, c1_over, c2_over, c3_over], axis=0)
df_oversample['labels'].value_counts()

0    3923
1    3923
2    3923
3    3923
Name: labels, dtype: int64

## Dataset Cleaning

In [8]:
import re
import nltk
import demoji
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = WordNetLemmatizer()


class Cleaner:
  def replace_emoji_and_dup(self, text):
    '''
    function takes in string `text` parameter and removes all duplicate characters in a row that are greater than 2 
    -replaces the emojis with their descriptions
    '''
    tracker = {}
    final = []

    # check if the past two characters were that same char

    for i in range(len(text)):
      if i < 2:
        final.append(text[i])
        continue
      if (text[i-1] == text[i]) and (text[i-2] == text[i]):
        pass
      else:
        final.append(text[i])

    final = "".join(final)
    return demoji.replace_with_desc(final)

  def replace_url(self, text):
    '''
    function takes a `string` parameter text and replaces all URLS with `_URL` 
    '''
    return re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", ' _URL', text)


  def final_clean(self, text):
    text = re.sub(r'\W', ' ', str(text))
    
    # remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text) 
    
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    
    # Converting to Lowercase
    text = text.lower()
    
    # Lemmatization
    text = text.split()

    text = [stemmer.lemmatize(word) for word in text]
    text = ' '.join(text)
    
    return text

  def clean_text(self, text):
    '''
    MASTER FUNCTION
    '''
    # remove emojis and duplicate chars
    clean1 = self.replace_emoji_and_dup(str(text))

    # replace links with _URL
    clean2 = self.replace_url(clean1)

    final = self.final_clean(clean2)

    return final

Cleaner = Cleaner()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
def prepreocess_text(text):
    return Cleaner.clean_text(text)

df_oversample['text'] = df_oversample['text'].apply(prepreocess_text)
df_oversample

Unnamed: 0,text,labels
1,in such country 710 block and 196 district hav...,0
2,the isi militant who attacked the café medium ...,0
3,you enter restaurant in bangladesh and ask peo...,0
4,the office started with ajan we read ajan and ...,0
6,on my very request 2 girl of delhi agreed to g...,0
...,...,...
1134,in addition in the name of the sachar commissi...,3
1405,according to him there is only one god in the ...,3
5207,but there are thousand of islamic study with t...,3
1778,islam doe not allow true muslim to consider in...,3


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas  as pd
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# All the models that we are going to try
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    SVC(kernel='rbf'),#SVM
    MultinomialNB(),#Naive Bayes classifier for multinomial models
    LogisticRegression(random_state=0),
]

def get_avg(lst):
    return sum(lst) / len(lst)

for model in models:
    model_name = model.__class__.__name__
    print(f'----------------------Model: {model_name}----------------------')
    none_precisions = []
    none_recalls = []
    none_f1s = []

    action_precisions = []
    action_recalls = []
    action_f1s = []
    
    oppression_precisions = []
    oppression_recalls = []
    oppression_f1s = []

    culture_precisions = []
    culture_recalls = []
    culture_f1s = []

    var = 0
    n=10
    skf = StratifiedKFold(n_splits=n, random_state=1337, shuffle=True)
    confusions = []

    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

    features = tfidf.fit_transform(df_oversample['text']).toarray()
    labels = df_oversample['labels']


    for i, x in enumerate(skf.split(df_oversample['text'], df_oversample['labels'])):
        df_train = df_oversample.iloc[x[0].tolist()]
        df_test = df_oversample.iloc[x[1].tolist()]

        axis=1
        features_train = features[x[0]]
        features_test = features[x[1]]

        

        X_train = features_train
        y_train = df_train['labels']
        X_test = features_test
        y_test = df_test['labels']
        

        print(f'{i}th split...')
        print(df_train.shape)
        print(df_test.shape)



        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

        none_precisions.append(report['none']['precision'])
        none_recalls.append(report['none']['recall'])
        none_f1s.append(report['none']['f1-score'])

        action_precisions.append(report['action']['precision'])
        action_recalls.append(report['action']['recall'])
        action_f1s.append(report['action']['f1-score'])

        oppression_precisions.append(report['oppression']['precision'])
        oppression_recalls.append(report['oppression']['recall'])
        oppression_f1s.append(report['oppression']['f1-score'])

        culture_precisions.append(report['culture']['precision'])
        culture_recalls.append(report['culture']['recall'])
        culture_f1s.append(report['culture']['f1-score'])
        if i == 9:
            break

    print("none_precision:", get_avg(none_precisions))
    print("none_recalls:", get_avg(none_recalls))
    print("none_f1s:", get_avg(none_f1s))

    print("action_precisions:", get_avg(action_precisions))
    print("action_recalls:", get_avg(action_recalls))
    print("action_f1s:", get_avg(action_f1s))

    print("oppression_precisions:", get_avg(oppression_precisions))
    print("oppression_recalls:", get_avg(oppression_recalls))
    print("oppression_f1s:", get_avg(oppression_f1s))
    
    print("culture_precisions:", get_avg(culture_precisions))
    print("culture_recalls:", get_avg(culture_recalls))
    print("culture_f1s:", get_avg(culture_f1s))


        

----------------------Model: RandomForestClassifier----------------------
0th split...
(14122, 2)
(1570, 2)
1th split...
(14122, 2)
(1570, 2)
2th split...
(14123, 2)
(1569, 2)
3th split...
(14123, 2)
(1569, 2)
4th split...
(14123, 2)
(1569, 2)
5th split...
(14123, 2)
(1569, 2)
6th split...
(14123, 2)
(1569, 2)
7th split...
(14123, 2)
(1569, 2)
8th split...
(14123, 2)
(1569, 2)
9th split...
(14123, 2)
(1569, 2)
none_precision: 0.5063552746650202
none_recalls: 0.5607830918626993
none_f1s: 0.5315522781020243
action_precisions: 0.653735787719007
action_recalls: 0.5707560886950199
action_f1s: 0.6087886537707528
oppression_precisions: 0.6495728978923007
oppression_recalls: 0.5330457755621333
oppression_f1s: 0.5843328392537848
culture_precisions: 0.5294039546195711
culture_recalls: 0.631643688009555
culture_f1s: 0.5756644816569775
----------------------Model: SVC----------------------
0th split...
(14122, 2)
(1570, 2)


In [None]:
# All the models that we are going to try
models = [
    MultinomialNB(),#Naive Bayes classifier for multinomial models
    LogisticRegression(random_state=0),
    SVC(kernel='rbf'),#SVM
]

def get_avg(lst):
    return sum(lst) / len(lst)

for model in models:
    model_name = model.__class__.__name__
    print(f'----------------------Model: {model_name}----------------------')
    none_precisions = []
    none_recalls = []
    none_f1s = []

    action_precisions = []
    action_recalls = []
    action_f1s = []
    
    oppression_precisions = []
    oppression_recalls = []
    oppression_f1s = []

    culture_precisions = []
    culture_recalls = []
    culture_f1s = []

    var = 0
    n=10
    skf = StratifiedKFold(n_splits=n, random_state=1337, shuffle=True)
    confusions = []

    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

    features = tfidf.fit_transform(df_oversample['text']).toarray()
    labels = df_oversample['labels']


    for i, x in enumerate(skf.split(df_oversample['text'], df_oversample['labels'])):
        df_train = df_oversample.iloc[x[0].tolist()]
        df_test = df_oversample.iloc[x[1].tolist()]

        axis=1
        features_train = features[x[0]]
        features_test = features[x[1]]

        

        X_train = features_train
        y_train = df_train['labels']
        X_test = features_test
        y_test = df_test['labels']
        

        print(f'{i}th split...')
        print(df_train.shape)
        print(df_test.shape)



        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

        none_precisions.append(report['none']['precision'])
        none_recalls.append(report['none']['recall'])
        none_f1s.append(report['none']['f1-score'])

        action_precisions.append(report['action']['precision'])
        action_recalls.append(report['action']['recall'])
        action_f1s.append(report['action']['f1-score'])

        oppression_precisions.append(report['oppression']['precision'])
        oppression_recalls.append(report['oppression']['recall'])
        oppression_f1s.append(report['oppression']['f1-score'])

        culture_precisions.append(report['culture']['precision'])
        culture_recalls.append(report['culture']['recall'])
        culture_f1s.append(report['culture']['f1-score'])
        if i == 9:
            break

    print("none_precision:", get_avg(none_precisions))
    print("none_recalls:", get_avg(none_recalls))
    print("none_f1s:", get_avg(none_f1s))

    print("action_precisions:", get_avg(action_precisions))
    print("action_recalls:", get_avg(action_recalls))
    print("action_f1s:", get_avg(action_f1s))

    print("oppression_precisions:", get_avg(oppression_precisions))
    print("oppression_recalls:", get_avg(oppression_recalls))
    print("oppression_f1s:", get_avg(oppression_f1s))
    
    print("culture_precisions:", get_avg(culture_precisions))
    print("culture_recalls:", get_avg(culture_recalls))
    print("culture_f1s:", get_avg(culture_f1s))


        

----------------------Model: MultinomialNB----------------------
0th split...
(14122, 2)
(1570, 2)
1th split...
(14122, 2)
(1570, 2)
2th split...
(14123, 2)
(1569, 2)
3th split...
(14123, 2)
(1569, 2)
4th split...
(14123, 2)
(1569, 2)
5th split...
(14123, 2)
(1569, 2)
6th split...
(14123, 2)
(1569, 2)
7th split...
(14123, 2)
(1569, 2)
8th split...
(14123, 2)
(1569, 2)
9th split...
(14123, 2)
(1569, 2)
none_precision: 0.7029932455941605
none_recalls: 0.6703945318585449
none_f1s: 0.6861226131261879
action_precisions: 0.8969836784777993
action_recalls: 0.9691462844679857
action_f1s: 0.9316362319936703
oppression_precisions: 0.7907065618958431
oppression_recalls: 0.7833235966142181
oppression_f1s: 0.7868461163570505
culture_precisions: 0.8219336274326228
culture_recalls: 0.8004141351196967
culture_f1s: 0.8108859839897397
----------------------Model: LogisticRegression----------------------
0th split...
(14122, 2)
(1570, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


1th split...
(14122, 2)
(1570, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


3th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


4th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


5th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


6th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


7th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


8th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


9th split...
(14123, 2)
(1569, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


none_precision: 0.8329566052421878
none_recalls: 0.6742204133561822
none_f1s: 0.7450091757900824
action_precisions: 0.9180184107180143
action_recalls: 0.9895447110141765
action_f1s: 0.9524257601240125
oppression_precisions: 0.8299443489556708
oppression_recalls: 0.8646323414862126
oppression_f1s: 0.8467993101536676
culture_precisions: 0.835476746089306
culture_recalls: 0.8939606376901905
culture_f1s: 0.8636909615231853
----------------------Model: SVC----------------------
0th split...
(14122, 2)
(1570, 2)


In [11]:
# All the models that we are going to try
models = [
    LinearSVC(),#SVM
]

def get_avg(lst):
    return sum(lst) / len(lst)

for model in models:
    model_name = model.__class__.__name__
    print(f'----------------------Model: {model_name}----------------------')
    none_precisions = []
    none_recalls = []
    none_f1s = []

    action_precisions = []
    action_recalls = []
    action_f1s = []
    
    oppression_precisions = []
    oppression_recalls = []
    oppression_f1s = []

    culture_precisions = []
    culture_recalls = []
    culture_f1s = []

    var = 0
    n=10
    skf = StratifiedKFold(n_splits=n, random_state=1337, shuffle=True)
    confusions = []

    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

    features = tfidf.fit_transform(df_oversample['text']).toarray()
    labels = df_oversample['labels']


    for i, x in enumerate(skf.split(df_oversample['text'], df_oversample['labels'])):
        df_train = df_oversample.iloc[x[0].tolist()]
        df_test = df_oversample.iloc[x[1].tolist()]

        axis=1
        features_train = features[x[0]]
        features_test = features[x[1]]

        

        X_train = features_train
        y_train = df_train['labels']
        X_test = features_test
        y_test = df_test['labels']
        

        print(f'{i}th split...')
        print(df_train.shape)
        print(df_test.shape)



        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        report = metrics.classification_report(y_test, y_pred, target_names = ['none', 'oppression', 'action', 'culture'], output_dict=True)

        none_precisions.append(report['none']['precision'])
        none_recalls.append(report['none']['recall'])
        none_f1s.append(report['none']['f1-score'])

        action_precisions.append(report['action']['precision'])
        action_recalls.append(report['action']['recall'])
        action_f1s.append(report['action']['f1-score'])

        oppression_precisions.append(report['oppression']['precision'])
        oppression_recalls.append(report['oppression']['recall'])
        oppression_f1s.append(report['oppression']['f1-score'])

        culture_precisions.append(report['culture']['precision'])
        culture_recalls.append(report['culture']['recall'])
        culture_f1s.append(report['culture']['f1-score'])
        if i == 9:
            break

    print("none_precision:", get_avg(none_precisions))
    print("none_recalls:", get_avg(none_recalls))
    print("none_f1s:", get_avg(none_f1s))

    print("action_precisions:", get_avg(action_precisions))
    print("action_recalls:", get_avg(action_recalls))
    print("action_f1s:", get_avg(action_f1s))

    print("oppression_precisions:", get_avg(oppression_precisions))
    print("oppression_recalls:", get_avg(oppression_recalls))
    print("oppression_f1s:", get_avg(oppression_f1s))
    
    print("culture_precisions:", get_avg(culture_precisions))
    print("culture_recalls:", get_avg(culture_recalls))
    print("culture_f1s:", get_avg(culture_f1s))


        

----------------------Model: LinearSVC----------------------
0th split...
(14122, 2)
(1570, 2)
1th split...
(14122, 2)
(1570, 2)
2th split...
(14123, 2)
(1569, 2)
3th split...
(14123, 2)
(1569, 2)
4th split...
(14123, 2)
(1569, 2)
5th split...
(14123, 2)
(1569, 2)
6th split...
(14123, 2)
(1569, 2)
7th split...
(14123, 2)
(1569, 2)
8th split...
(14123, 2)
(1569, 2)
9th split...
(14123, 2)
(1569, 2)
none_precision: 0.9039828968664347
none_recalls: 0.6890111388066676
none_f1s: 0.7816575373289038
action_precisions: 0.9497265530857945
action_recalls: 0.999490445032975
action_f1s: 0.9739462785784694
oppression_precisions: 0.8490250765886982
oppression_recalls: 0.9225151892818195
oppression_f1s: 0.8840475457142507
culture_precisions: 0.8717874739795534
culture_recalls: 0.9566631614477854
culture_f1s: 0.9121609678193667


In [None]:
# Need to modify this code so that it will do 10 folds of when testing each model

import warnings
warnings.simplefilter("ignore")

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    SVC(kernel='poly'),#SVM
    MultinomialNB(),#Naive Bayes classifier for multinomial models
    LogisticRegression(random_state=0),
]
CV = 10
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    print("Model...", model_name)
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        print("fold", fold_idx)
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

Model... RandomForestClassifier
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model... LinearSVC
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model... MultinomialNB
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
Model... LogisticRegression
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9


In [None]:
# Check the accuracy
cv_df.groupby('model_name').accuracy.mean()

model_name
LinearSVC                 0.608857
LogisticRegression        0.631429
MultinomialNB             0.608000
RandomForestClassifier    0.560429
Name: accuracy, dtype: float64

In [None]:
model = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83      1245
           1       0.90      0.88      0.89      1320
           2       0.97      1.00      0.98      1308
           3       0.93      0.91      0.92      1306

    accuracy                           0.91      5179
   macro avg       0.90      0.90      0.90      5179
weighted avg       0.91      0.91      0.91      5179



In [None]:
model = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.95      0.73      1309
           1       0.62      0.24      0.35       487
           2       0.00      0.00      0.00       141
           3       0.64      0.04      0.08       373

    accuracy                           0.60      2310
   macro avg       0.46      0.31      0.29      2310
weighted avg       0.57      0.60      0.50      2310



In [None]:
model = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.89      0.75      1309
           1       0.55      0.38      0.45       487
           2       0.17      0.01      0.01       141
           3       0.51      0.19      0.28       373

    accuracy                           0.62      2310
   macro avg       0.47      0.37      0.37      2310
weighted avg       0.57      0.62      0.56      2310



In [None]:
model = LinearSVC()
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.78      0.72      1309
           1       0.49      0.49      0.49       487
           2       0.23      0.08      0.12       141
           3       0.42      0.30      0.35       373

    accuracy                           0.60      2310
   macro avg       0.45      0.41      0.42      2310
weighted avg       0.57      0.60      0.58      2310

