In [15]:
import pandas as pd 
import nltk
from autocorrect import Speller
import ast
import re
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

nltk.download('punkt')
from nltk.tokenize import word_tokenize as wt

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()



def bag_of_words(df_train, df_test):
    df_train['labels'] = df_train.apply(lambda row: ast.literal_eval(row['labels']), axis=1)
    df_test['labels'] = df_test.apply(lambda row: ast.literal_eval(row['labels']), axis=1)
    #lower case
    df_train.text = df_train.apply(lambda row: row.text.lower(), axis=1)
    df_test.text = df_test.apply(lambda row: row.text.lower(), axis =1)
    print("Lower case Done. ")

    # remove non alphabetic characters
    df_train.text = df_train.apply(lambda row: re.sub('[^A-Za-z]', ' ', row.text), axis=1)
    df_test.text = df_test.apply(lambda row: re.sub('[^A-Za-z]', ' ', row.text), axis=1)
    print("Non alphabetic characters Removed!")

    # tokenize
    df_train.text = df_train.apply(lambda row: wt(row.text), axis=1)
    df_test.text = df_test.apply(lambda row: wt(row.text), axis=1)
    print("Tokenize Done.")

    # remove stop words and stemming
    spell = Speller(lang='en')
    # df_train.text = df_train.apply(lambda row: [spell(stemmer.stem(word)) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    # df_test.text = df_test.apply(lambda row: [spell(stemmer.stem(word)) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    df_train.text = df_train.apply(lambda row: [stemmer.stem(word) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    df_test.text = df_test.apply(lambda row: [stemmer.stem(word) for word in row.text if word not in set(stopwords.words('english'))], axis=1)
    print("stop words + stemming Done.")

    # join
    df_train.text = df_train.apply(lambda row: " ".join(row.text), axis=1)
    df_test.text = df_test.apply(lambda row: " ".join(row.text), axis=1)
    print("join done!")

    # creating the feature matrix 
    matrix = CountVectorizer()
    len_train = len(df_train.text)
    X = matrix.fit_transform(df_train.text.to_list()+df_test.text.to_list()).toarray()
    X_train = X[:len_train,:]
    X_test = X[len_train:,:]
    y_train = df_train.labels.to_list()
    y_test = df_test.labels.to_list()

    model = RandomForestClassifier(max_depth=12, random_state=0)
    classifier = MultiOutputClassifier(model).fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred,average='micro')
    f1_macro = f1_score(y_test, y_pred,average='macro')
    print("f1 micro", f1_micro)
    print("f1 macro", f1_macro)
    print("accuracy", accuracy)

[nltk_data] Downloading package punkt to
[nltk_data]     /users/grad/frahimi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/grad/frahimi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
print("OpenI:")
df_train = pd.read_csv("../data/OpenI/openI_train.csv") 
df_test = pd.read_csv("../data/OpenI/openI_test.csv")

bag_of_words(df_train, df_test)



OpenI:
Lower case Done. 
Non alphabetic characters Removed!
Tokenize Done.
stop words + stemming Done.
join done!
f1 micro 0.3810483870967742
f1 macro 0.28665912431705076
accuracy 0.4509493670886076


In [None]:
print("ohsumed:")

df_train = pd.read_csv("../data/ohsumed/ohsumed_train.csv") 
df_test = pd.read_csv("../data/ohsumed/ohsumed_test.csv")

bag_of_words(df_train, df_test)

In [None]:
print("reuters:")

df_train = pd.read_csv("../data/reuters/reuters_train.csv") 
df_test = pd.read_csv("../data/reuters/reuters_test.csv")

bag_of_words(df_train, df_test)

In [None]:
print("twentynewsgroup:")

df_train = pd.read_csv("../data/twentynewsgroup/twentynewsgroup_train.csv") 
df_test = pd.read_csv("../data/twentynewsgroup/twentynewsgroup_test.csv")

bag_of_words(df_train, df_test)