In [1]:
from skmultilearn.model_selection import IterativeStratification
import pandas as pd 
import numpy as np
import nltk
from autocorrect import Speller
import ast
import re
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier


nltk.download('punkt')
from nltk.tokenize import word_tokenize as wt

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     /users/grad/frahimi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/grad/frahimi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def bag_of_words(df_train, df_test):
    df_train['labels'] = df_train.apply(lambda row: ast.literal_eval(row['labels']), axis=1)
    df_test['labels'] = df_test.apply(lambda row: ast.literal_eval(row['labels']), axis=1)
    #lower case
    df_train.text = df_train.apply(lambda row: row.text.lower(), axis=1)
    df_test.text = df_test.apply(lambda row: row.text.lower(), axis =1)
    print("Lower case Done. ")

    # remove non alphabetic characters
    df_train.text = df_train.apply(lambda row: re.sub('[^A-Za-z]', ' ', row.text), axis=1)
    df_test.text = df_test.apply(lambda row: re.sub('[^A-Za-z]', ' ', row.text), axis=1)
    print("Non alphabetic characters Removed!")

    # tokenize
    df_train.text = df_train.apply(lambda row: wt(row.text), axis=1)
    df_test.text = df_test.apply(lambda row: wt(row.text), axis=1)
    print("Tokenize Done.")

    # remove stop words and stemming
    spell = Speller(lang='en')
    # df_train.text = df_train.apply(lambda row: [spell(stemmer.stem(word)) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    # df_test.text = df_test.apply(lambda row: [spell(stemmer.stem(word)) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    df_train.text = df_train.apply(lambda row: [stemmer.stem(word) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    df_test.text = df_test.apply(lambda row: [stemmer.stem(word) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    print("stop words + stemming Done.")

    # join
    df_train.text = df_train.apply(lambda row: " ".join(row.text), axis=1)
    df_test.text = df_test.apply(lambda row: " ".join(row.text), axis=1)
    print("join done!")

    # creating the feature matrix 
    matrix = CountVectorizer()
    len_train = len(df_train.text)
    X = matrix.fit_transform(df_train.text.to_list()+df_test.text.to_list()).toarray()
    X_train = X[:len_train,:]
    X_test = X[len_train:,:]
    y_train = df_train.labels.to_list()
    y_test = df_test.labels.to_list()

    model = RandomForestClassifier(max_depth=12, random_state=0)
    classifier = MultiOutputClassifier(model).fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred,average='micro')
    f1_macro = f1_score(y_test, y_pred,average='macro')
    print("f1 micro", f1_micro)
    print("f1 macro", f1_macro)
    print("accuracy", accuracy)

In [7]:
def tfidf(df_train_orig, df_test):
    df_train_orig['labels'] = df_train_orig.apply(lambda row: ast.literal_eval(row['labels']), axis=1)
    df_test['labels'] = df_test.apply(lambda row: ast.literal_eval(row['labels']), axis=1)
    #lower case
    df_train_orig.text = df_train_orig.apply(lambda row: row.text.lower(), axis=1)
    df_test.text = df_test.apply(lambda row: row.text.lower(), axis =1)
    print("Lower case Done. ")

    # remove non alphabetic characters
    df_train_orig.text = df_train_orig.apply(lambda row: re.sub('[^A-Za-z]', ' ', row.text), axis=1)
    df_test.text = df_test.apply(lambda row: re.sub('[^A-Za-z]', ' ', row.text), axis=1)
    print("Non alphabetic characters Removed!")

    # tokenize
    df_train_orig.text = df_train_orig.apply(lambda row: wt(row.text), axis=1)
    df_test.text = df_test.apply(lambda row: wt(row.text), axis=1)
    print("Tokenize Done.")

    # remove stop words and stemming
    spell = Speller(lang='en')
    df_train_orig.text = df_train_orig.apply(lambda row: [stemmer.stem(word) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    df_test.text = df_test.apply(lambda row: [stemmer.stem(word) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    print("stop words + stemming Done.")

    # join
    df_train_orig.text = df_train_orig.apply(lambda row: " ".join(row.text), axis=1)
    df_test.text = df_test.apply(lambda row: " ".join(row.text), axis=1)
    print("join done!")


    # Building a TF IDF matrix 
    td = TfidfVectorizer(max_features = 8000)

    fold_i =0
    stratifier = IterativeStratification(n_splits=5, order=2)
    results = []
    for train_indexes, val_indexes in stratifier.split(df_train_orig['text'], np.array(df_train_orig['labels'].to_list())):
        fold_i += 1
        print(f"[dataset] Fold {fold_i}")

        train_df_cv = df_train_orig.iloc[train_indexes,:]
        val_df_cv = df_train_orig.iloc[val_indexes,:]

        len_train_df_cv = len(train_df_cv)
        len_val_df_cv = len(val_df_cv)

        X = td.fit_transform(train_df_cv.text.to_list()+val_df_cv.text.to_list()+df_test.text.to_list()).toarray()
        X_train = X[:len_train_df_cv,:]
        X_val   = X[len_train_df_cv:len_train_df_cv+len_val_df_cv,:]
        X_test  = X[len_train_df_cv+len_val_df_cv:,:]

        y_train = train_df_cv.labels.to_list()
        y_val   = val_df_cv.labels.to_list()
        y_test  = df_test.labels.to_list()

        # from sklearn.naive_bayes import MultinomialNB
        # model = MultinomialNB()

        model = RandomForestClassifier(max_depth=12, random_state=0)
        classifier = MultiOutputClassifier(model).fit(X_train, y_train)

        y_pred = classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1_micro = f1_score(y_test, y_pred,average='micro')
        f1_macro = f1_score(y_test, y_pred,average='macro')

        results.append([f1_micro,f1_macro,accuracy])

    results = np.array(results)
    mean = np.mean(results, axis=0)
    std = np.std(results, axis=0)

    print("f1_micro= ", mean[0]*100," +- ", std[0]*100)
    print("f1_macro = ", mean[1]*100," +- ", std[1]*100)
    print("acc = ", mean[2]*100," +- ", std[2]*100)

In [8]:
print("OpenI:")
df_train = pd.read_csv("../data/OpenI/openI_train.csv") 
df_test = pd.read_csv("../data/OpenI/openI_test.csv")

# bag_of_words(df_train, df_test)
tfidf(df_train, df_test)


OpenI:
Lower case Done. 
Non alphabetic characters Removed!
Tokenize Done.
stop words + stemming Done.
join done!
[dataset] Fold 1
[dataset] Fold 2
[dataset] Fold 3
[dataset] Fold 4
[dataset] Fold 5
f1_micro=  29.17517727987029  +-  2.0516641772232913
f1_macro =  21.378596706379604  +-  1.0227448397779957
acc =  43.449367088607595  +-  1.1035187198837146


In [10]:
print("ohsumed:")

df_train = pd.read_csv("../data/ohsumed/ohsumed_train.csv") 
df_test = pd.read_csv("../data/ohsumed/ohsumed_test.csv")

# bag_of_words(df_train, df_test)
tfidf(df_train, df_test)

ohsumed:
Lower case Done. 
Non alphabetic characters Removed!
Tokenize Done.
stop words + stemming Done.
join done!
[dataset] Fold 1
[dataset] Fold 2
[dataset] Fold 3
[dataset] Fold 4
[dataset] Fold 5
f1_micro=  11.449937053354082  +-  0.40288040202325937
f1_macro =  5.667292817431572  +-  0.21391753806927774
acc =  5.409044343626518  +-  0.23132339889921547


In [None]:
print("reuters:")

df_train = pd.read_csv("../data/reuters/reuters_train.csv") 
df_test = pd.read_csv("../data/reuters/reuters_test.csv")

# bag_of_words(df_train, df_test)
tfidf(df_train, df_test)

In [None]:
print("twentynewsgroup:")

df_train = pd.read_csv("../data/twentynewsgroup/twentynewsgroup_train.csv") 
df_test = pd.read_csv("../data/twentynewsgroup/twentynewsgroup_test.csv")

# bag_of_words(df_train, df_test)
tfidf(df_train, df_test)