In [1]:
from skmultilearn.model_selection import IterativeStratification
import pandas as pd 
import numpy as np
import nltk
from autocorrect import Speller
import ast
import re
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
import string

nltk.download('punkt')
from nltk.tokenize import word_tokenize as wt

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

symbols = ['.',',',':','?','!','<','>','(',')','#','--','-','$','@','%','``',';',"''"]

[nltk_data] Downloading package punkt to
[nltk_data]     /users/grad/frahimi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/grad/frahimi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

def bag_of_words(df_train_orig, df_test):
    df_train_orig['labels'] = df_train_orig.apply(lambda row: ast.literal_eval(row['labels']), axis=1)
    df_test['labels'] = df_test.apply(lambda row: ast.literal_eval(row['labels']), axis=1)

    df_train_orig.text = df_train_orig.text.apply(preprocess)
    df_test.text = df_test.text.apply(preprocess)

    c_vect = CountVectorizer(max_df=0.8, min_df=10, max_features=40000, ngram_range=(1,1),
                        lowercase=False)
    fold_i =0
    stratifier = IterativeStratification(n_splits=5, order=2)
    results = []
    for train_indexes, val_indexes in stratifier.split(df_train_orig['text'], np.array(df_train_orig['labels'].to_list())):
        fold_i += 1
        print(f"[dataset] Fold {fold_i}")

        train_df_cv = df_train_orig.iloc[train_indexes,:]
        val_df_cv = df_train_orig.iloc[val_indexes,:]

        len_train_df_cv = len(train_df_cv)
        len_val_df_cv = len(val_df_cv)

        X = c_vect.fit_transform(train_df_cv.text.to_list()+val_df_cv.text.to_list()+df_test.text.to_list()).toarray()
        X_train = X[:len_train_df_cv,:]
        X_val   = X[len_train_df_cv:len_train_df_cv+len_val_df_cv,:]
        X_test  = X[len_train_df_cv+len_val_df_cv:,:]

        y_train = train_df_cv.labels.to_list()
        y_val   = val_df_cv.labels.to_list()
        y_test  = df_test.labels.to_list()

        # from sklearn.naive_bayes import MultinomialNB
        # model = MultinomialNB()

        # model = RandomForestClassifier(max_depth=12, random_state=0)
        model = RandomForestClassifier(n_estimators=30, max_depth=500,
                               min_samples_split=2,
                               min_samples_leaf=1, max_leaf_nodes=None,
                               class_weight='balanced')
        classifier = MultiOutputClassifier(model).fit(X_train, y_train)

        y_pred = classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1_micro = f1_score(y_test, y_pred,average='micro')
        f1_macro = f1_score(y_test, y_pred,average='macro')

        results.append([f1_micro,f1_macro,accuracy])

    results = np.array(results)
    mean = np.mean(results, axis=0)
    std = np.std(results, axis=0)

    print("f1_micro= ", mean[0]*100," +- ", std[0]*100)
    print("f1_macro = ", mean[1]*100," +- ", std[1]*100)
    print("acc = ", mean[2]*100," +- ", std[2]*100)

In [4]:
def preprocess(row):
    # lower case
    row = row.lower()

    #remove symbols
    row = row.translate(str.maketrans('', '', string.punctuation))

    #remove non alphabetic 
    row = re.sub('[^A-Za-z]', ' ', row)

    #tokenize
    row = wt(row)
    
    #remove words with less than 3 characters
    row = [word for word in row if len(word)>2]

    # remove stop words and stemming
    # row = [stemmer.stem(word) for word in row if word not in set(stopwords.words('english'))]
    row = [word for word in row if word not in set(stopwords.words('english'))]

    #if a word appears too much remove it

    #join
    row = " ".join(row)

    return row

def tfidf(df_train_orig, df_test):
    df_train_orig['labels'] = df_train_orig.apply(lambda row: ast.literal_eval(row['labels']), axis=1)
    df_test['labels'] = df_test.apply(lambda row: ast.literal_eval(row['labels']), axis=1)

    df_train_orig.text = df_train_orig.text.apply(preprocess)
    df_test.text = df_test.text.apply(preprocess)

    # Building a TF IDF matrix 
    # td = TfidfVectorizer(max_features = 8000) 
    td = TfidfVectorizer(max_df=0.8, min_df=10, max_features=40000, ngram_range=(1,1),
                        lowercase=False)
    fold_i =0
    stratifier = IterativeStratification(n_splits=5, order=2)
    results = []
    for train_indexes, val_indexes in stratifier.split(df_train_orig['text'], np.array(df_train_orig['labels'].to_list())):
        fold_i += 1
        print(f"[dataset] Fold {fold_i}")

        train_df_cv = df_train_orig.iloc[train_indexes,:]
        val_df_cv = df_train_orig.iloc[val_indexes,:]

        len_train_df_cv = len(train_df_cv)
        len_val_df_cv = len(val_df_cv)

        X = td.fit_transform(train_df_cv.text.to_list()+val_df_cv.text.to_list()+df_test.text.to_list()).toarray()
        X_train = X[:len_train_df_cv,:]
        X_val   = X[len_train_df_cv:len_train_df_cv+len_val_df_cv,:]
        X_test  = X[len_train_df_cv+len_val_df_cv:,:]

        y_train = train_df_cv.labels.to_list()
        y_val   = val_df_cv.labels.to_list()
        y_test  = df_test.labels.to_list()

        # from sklearn.naive_bayes import MultinomialNB
        # model = MultinomialNB()

        # model = RandomForestClassifier(max_depth=12, random_state=0)
        model = RandomForestClassifier(n_estimators=30, max_depth=500,
                               min_samples_split=2,
                               min_samples_leaf=1, max_leaf_nodes=None,
                               class_weight='balanced')
        classifier = MultiOutputClassifier(model).fit(X_train, y_train)

        y_pred = classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1_micro = f1_score(y_test, y_pred,average='micro')
        f1_macro = f1_score(y_test, y_pred,average='macro')

        results.append([f1_micro,f1_macro,accuracy])

    results = np.array(results)
    mean = np.mean(results, axis=0)
    std = np.std(results, axis=0)

    print("f1_micro= ", mean[0]*100," +- ", std[0]*100)
    print("f1_macro = ", mean[1]*100," +- ", std[1]*100)
    print("acc = ", mean[2]*100," +- ", std[2]*100)

In [5]:
print("OpenI:")
df_train = pd.read_csv("../data/OpenI/openI_train.csv") 
df_test = pd.read_csv("../data/OpenI/openI_test.csv")

# bag_of_words(df_train, df_test)
tfidf(df_train, df_test)


OpenI:
[dataset] Fold 1
[dataset] Fold 2
[dataset] Fold 3
[dataset] Fold 4
[dataset] Fold 5
f1_micro=  61.870981095222746  +-  0.36375897147228514
f1_macro =  53.0950593976778  +-  0.8501006441709079
acc =  55.82278481012658  +-  0.44077178092355046


In [10]:
print("ohsumed:")

df_train = pd.read_csv("../data/ohsumed/ohsumed_train.csv") 
df_test = pd.read_csv("../data/ohsumed/ohsumed_test.csv")

# bag_of_words(df_train, df_test)
tfidf(df_train, df_test)

ohsumed:
Lower case Done. 
Non alphabetic characters Removed!
Tokenize Done.
stop words + stemming Done.
join done!
[dataset] Fold 1
[dataset] Fold 2
[dataset] Fold 3
[dataset] Fold 4
[dataset] Fold 5
f1_micro=  11.449937053354082  +-  0.40288040202325937
f1_macro =  5.667292817431572  +-  0.21391753806927774
acc =  5.409044343626518  +-  0.23132339889921547


In [None]:
print("reuters:")

df_train = pd.read_csv("../data/reuters/reuters_train.csv") 
df_test = pd.read_csv("../data/reuters/reuters_test.csv")

# bag_of_words(df_train, df_test)
tfidf(df_train, df_test)

In [7]:
print("twentynewsgroup:")

df_train = pd.read_csv("../data/twentynewsgroup/twentynewsgroup_train.csv") 
df_test = pd.read_csv("../data/twentynewsgroup/twentynewsgroup_test.csv")

# bag_of_words(df_train, df_test)
tfidf(df_train, df_test)

twentynewsgroup:
[dataset] Fold 1
[dataset] Fold 2
[dataset] Fold 3
[dataset] Fold 4
[dataset] Fold 5
f1_micro=  16.133115732204036  +-  0.34243139822645013
f1_macro =  15.089176770885116  +-  0.30258028943646686
acc =  8.685780817930432  +-  0.16501863231620867
