In [2]:

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix
import itertools
import re
import nltk
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import io
import warnings 

In [6]:
def evaluate(model_name, x_train, x_test, y_train, y_test, random_state=None):
    """Fit the named classifier on the training split and score it on the test split.

    Parameters
    ----------
    model_name : str
        ``'Random Forest'`` or ``'Decision Tree'``.
    x_train, x_test
        Feature matrices handed to ``model.fit`` and ``model.predict``.
    y_train, y_test
        Labels handed to ``model.fit`` and ``accuracy_score``.
    random_state : int or None, optional
        Seed forwarded to the classifier constructor. The default ``None``
        leaves the classifier unseeded, as before.

    Returns
    -------
    The value returned by ``accuracy_score(y_test, predictions)``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name == 'Random Forest':
        model = RandomForestClassifier(random_state=random_state)
    elif model_name == 'Decision Tree':
        model = DecisionTreeClassifier(random_state=random_state)
    else:
        # Without this branch `model` is never bound and the call below fails
        # with an UnboundLocalError that does not mention the bad name.
        raise ValueError(
            "Unknown model_name {!r}; expected 'Random Forest' or 'Decision Tree'".format(model_name)
        )

    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return accuracy_score(y_test, predictions)

def cv_designer(X, min_df, max_features, n_train=50333):
    """Turn ``X['text']`` into word-count features and split them by row position.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``'text'`` column; every row is vectorised.
    min_df, max_features
        Forwarded unchanged to ``CountVectorizer``.
    n_train : int, optional
        Number of leading rows returned as the training part. Defaults to
        50333, the split point this function previously hard-coded.

    Returns
    -------
    (x_train, x_test) : tuple of pandas.DataFrame
        The first ``n_train`` rows and the remaining rows of the dense
        feature frame, one column per vocabulary term.
    """
    cnt_vct = CountVectorizer(analyzer='word', min_df=min_df, max_features=max_features)
    # NOTE(review): the vectorizer is fitted on every row of X, so rows that end
    # up in x_test also shape the vocabulary. Confirm that is intended.
    X_cv = cnt_vct.fit_transform(X['text'])

    # get_feature_names() is deprecated/removed in newer scikit-learn releases;
    # use its replacement when present and fall back on older installs.
    if hasattr(cnt_vct, 'get_feature_names_out'):
        feature_names = cnt_vct.get_feature_names_out()
    else:
        feature_names = cnt_vct.get_feature_names()

    # .toarray() materialises the full dense matrix in memory.
    features = pd.DataFrame(X_cv.toarray(), columns=feature_names)
    x_train = features.iloc[:n_train]
    x_test = features.iloc[n_train:]
    return x_train, x_test

def tf_designer(X, min_df, max_features, n_train=50333):
    """Turn ``X['text']`` into TF-IDF features and split them by row position.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``'text'`` column; every row is vectorised.
    min_df, max_features
        Forwarded unchanged to ``TfidfVectorizer``.
    n_train : int, optional
        Number of leading rows returned as the training part. Defaults to
        50333, the split point this function previously hard-coded.

    Returns
    -------
    (x_train, x_test) : tuple of pandas.DataFrame
        The first ``n_train`` rows and the remaining rows of the dense
        feature frame, one column per vocabulary term.
    """
    tf_vct = TfidfVectorizer(analyzer='word', min_df=min_df, max_features=max_features)
    # NOTE(review): the vectorizer is fitted on every row of X, so rows that end
    # up in x_test also shape the fitted vocabulary/weights. Confirm that is intended.
    X_tf = tf_vct.fit_transform(X['text'])

    # get_feature_names() is deprecated/removed in newer scikit-learn releases;
    # use its replacement when present and fall back on older installs.
    if hasattr(tf_vct, 'get_feature_names_out'):
        feature_names = tf_vct.get_feature_names_out()
    else:
        feature_names = tf_vct.get_feature_names()

    # .toarray() materialises the full dense matrix in memory.
    features = pd.DataFrame(X_tf.toarray(), columns=feature_names)
    x_train = features.iloc[:n_train]
    x_test = features.iloc[n_train:]
    return x_train, x_test

def main(train_path=r"C:/Users/User/Desktop/semeval/train.csv",
         test_path=r"C:/Users/User/Desktop/semeval/test.csv",
         results_path='machine_learning_models_results.csv'):
    """Compare the classifiers on count and TF-IDF features and report accuracy.

    Reads the train/test CSV files, vectorises their combined text with
    ``cv_designer`` and ``tf_designer``, scores every model in ``model_names``
    on both feature sets via ``evaluate``, writes the accuracy table (as
    percentage strings with two decimals) to ``results_path`` and prints it.

    Parameters
    ----------
    train_path, test_path : str, optional
        CSV files that must contain ``id`` and ``label`` columns; the remaining
        columns are passed to the designers, which read ``'text'``. Defaults
        are the locations this notebook originally hard-coded.
    results_path : str, optional
        Destination of the results CSV.
    """
    min_df = 5
    max_features = 500  # 50000
    model_names = ['Random Forest', 'Decision Tree']
    designers = {'CountVectorizer': cv_designer, 'TF-IDF': tf_designer}
    results = pd.DataFrame(columns=model_names, index=list(designers))

    # on_bad_lines='skip' silently drops malformed rows, so the row counts seen
    # here can be smaller than the files' line counts.
    train_data = pd.read_csv(train_path, on_bad_lines='skip', encoding="utf8")
    test_data = pd.read_csv(test_path, on_bad_lines='skip', encoding="utf8")

    x_train = train_data.drop(['id', 'label'], axis=1)
    y_train = train_data['label']
    x_test = test_data.drop(['id', 'label'], axis=1)
    y_test = test_data['label']

    # DataFrame.append is deprecated/removed in newer pandas; concat is the
    # equivalent supported call.
    X = pd.concat([x_train, x_test], ignore_index=True)

    # Vectorising does not depend on the model, so do it once per feature type
    # instead of once per (model, feature type) pair.
    n_train = len(y_train)
    features = {}
    for vectorizer_name, designer in designers.items():
        train_part, test_part = designer(X, min_df, max_features)
        if len(train_part) != n_train:
            # The designers cut at a fixed row; re-cut at the real train/test
            # boundary so features and labels stay aligned.
            combined = pd.concat([train_part, test_part])
            train_part = combined.iloc[:n_train]
            test_part = combined.iloc[n_train:]
        features[vectorizer_name] = (train_part, test_part)

    for model_name in model_names:
        for vectorizer_name, (train_part, test_part) in features.items():
            accuracy = evaluate(model_name, train_part, test_part, y_train, y_test)
            # Single .loc[row, col] assignment: the chained form
            # results.loc[row][col] may write to a temporary copy.
            results.loc[vectorizer_name, model_name] = '{:.2f}'.format(accuracy * 100)
        print('Done: ', model_name)

    results.to_csv(results_path)
    print(results)

# Run the comparison; main() prints a 'Done' line per model and the final accuracy table.
main()

Done:  Random Forest
Done:  Decision Tree
                Random Forest Decision Tree
CountVectorizer         48.88         46.17
TF-IDF                  49.07         46.96
