In [1]:
# ---- Run configuration: every knob the later cells read lives here ----

# Feature representation handed to fit(): 'BoW', 'TFIDF' or 'Both'.
representation = 'TFIDF'
# Forwarded to clean() as its second argument and embedded in saved file names.
size = 500
# Gates the data-preparation cell and is passed to fit() as `refit`.
retrain = True
# Model keys passed to fit(): 'nb', 'lr', 'svm'.
models = ['nb', 'lr', 'svm']
# Passed to fit(); when true the fitted models and reports are written to disk.
save = True
# Outputs fit() should render: 'table' and/or 'matrix'.
plot = ['table']

# Raw tweet file handed to clean().
dataFile = "training.1600000.processed.noemoticon.csv"
# True -> the data cell round-trips through cleaned_data.csv instead of memory.
makingCSV = False

# Saving or plotting needs freshly fitted models, so refuse to continue
# when retraining is switched off but any of those outputs was requested.
if not retrain and (save or 'table' in plot or 'matrix' in plot):
    raise Exception('Must retrain to take other actions')

In [2]:
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.linear_model
import sklearn.svm
import sklearn.metrics
import pandas as pd

In [3]:
# Build the feature matrices used by fit(): bow_df, tfidf_df and sentiment.
if retrain:
    # Imported before branching: BOTH paths below call vectorize() and clean().
    # (Previously the import lived only in the in-memory branch, so the CSV
    # branch raised NameError on a fresh kernel.)
    from CleanTweetsScript import vectorize, clean

    if not makingCSV:
        # In-memory path: vectorize() returns the frame and the vocabularies.
        (df, doubleVocab) = vectorize(clean(dataFile, size))

        print("return")
        print(df.shape)

    else:
        # Disk path: vectorize() writes the CSV, which is then read back.
        # NOTE(review): this branch does not bind `doubleVocab`, which fit()
        # reads when saving -- confirm what vectorize(makeCSV=True) returns.
        vectorize(clean(dataFile, size), makeCSV= True)
        df = pd.read_csv("cleaned_data.csv", index_col=0)
        print("csv")

    # Keep only rows with a usable label: drop label 2 and missing labels.
    df = df[(df.Label != 2) & pd.notna(df.Label)]

    # The first three columns are Label, Text and Cleaned (see the displayed
    # frame); the remaining columns are split into two equal halves.
    first_feature = 3
    half_point = first_feature + (df.shape[1] - first_feature) // 2

    # First half of the feature columns (the "B: ..." columns in the output).
    bow_df = df.iloc[:, first_feature:half_point].reset_index(drop=True)

    # Second half of the feature columns, used as the TF-IDF features.
    tfidf_df = df.iloc[:, half_point:].reset_index(drop=True)

    # Target vector, re-indexed so it lines up with the two feature frames.
    sentiment = df.Label.reset_index(drop=True)

(   Label                                               Text  \
0     -1  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1     -1  is upset that he can't update his Facebook by ...   
2     -1  @Kenichan I dived many times for the ball. Man...   
3     -1    my whole body feels itchy and like its on fire    
4     -1  @nationwideclass no, it's not behaving at all....   
5      1       I LOVE @Health4UandPets u guys r the best!!    
6      1  im meeting up with one of my besties tonight! ...   
7      1  @DaRealSunisaKim Thanks for the Twitter add, S...   
8      1  Being sick can be really cheap when it hurts t...   
9      1    @LovesBrooklyn2 he has that effect on everyone    

                                             Cleaned  B: awww  B: bummer  \
0     awww bummer shoulda got david carr third day d        0          0   
1  upset updat facebook text might cri result sch...        0          0   
2    dive mani time ball manag save 50 rest go bound        1          0   
3

In [4]:
def fit(rep, refit=True, models=('nb', 'lr', 'svm'), save=True, plot=('table', 'matrix'),
        random_state=None, model_dir='models'):
    """Train, score and optionally persist classifiers for one representation.

    Reads the notebook-level names ``bow_df``, ``tfidf_df`` and ``sentiment``
    built by the data-preparation cell. When ``save`` is true it also reads
    ``size`` (used in file names) and ``doubleVocab`` (the saved vocabulary).

    Parameters
    ----------
    rep : str
        ``'BoW'`` or ``'TFIDF'``; selects ``bow_df`` or ``tfidf_df``.
    refit : bool
        When false nothing is trained. That is only valid if neither saving
        nor plotting was requested, because both need freshly fitted models.
    models : iterable of str
        Any subset of ``'nb'``, ``'lr'``, ``'svm'``. Unknown keys are ignored.
    save : bool
        Write each fitted model, the vocabulary, the metrics table (HTML) and
        the time log under ``<model_dir>/<rep><size>/``.
    plot : iterable of str
        ``'table'`` displays the metrics frame; ``'matrix'`` draws one
        confusion matrix per fitted model.
    random_state : int or None
        Forwarded to ``train_test_split``; ``None`` keeps the unseeded split.
    model_dir : str
        Root directory for saved artefacts, relative to the working directory
        unless an absolute path is given.

    Raises
    ------
    ValueError
        If ``rep`` is not recognised, if no known model key was requested, or
        if ``refit`` is false while saving or plotting was requested.
    """
    from datetime import datetime

    print(rep)

    if rep not in ("BoW", "TFIDF"):
        raise ValueError("Variable 'rep' must be 'BoW' or 'TFIDF'.")

    if not refit:
        # Without a fit there are no models or predictions to report on.
        if save or 'table' in plot or 'matrix' in plot:
            raise ValueError('Must retrain to take other actions')
        return

    # (key, display title, file-name tag, estimator class), in reporting order.
    catalogue = (
        ('nb', "Naive Bayes", 'NaiveBayes', sklearn.naive_bayes.MultinomialNB),
        ('lr', "Logistic Regression", 'LogisticRegression', sklearn.linear_model.LogisticRegression),
        ('svm', "Support Vector Machine", 'SupportVectorMachine', sklearn.svm.SVC),
    )
    requested = [entry for entry in catalogue if entry[0] in models]
    if not requested:
        raise ValueError("No known model requested; choose from 'nb', 'lr', 'svm'.")

    time_log = []

    def log(event):
        # Print and record the same timestamped line so the two never differ.
        line = f"{event} at {datetime.now().strftime('%H:%M:%S')}"
        print(line)
        time_log.append(line)

    def scores(y_true, y_pred):
        # precision/recall/f1 use sklearn's default binary averaging (pos_label=1).
        return (
            sklearn.metrics.accuracy_score(y_true, y_pred),
            sklearn.metrics.precision_score(y_true, y_pred),
            sklearn.metrics.recall_score(y_true, y_pred),
            sklearn.metrics.f1_score(y_true, y_pred),
        )

    log("Started")

    features = bow_df if rep == "BoW" else tfidf_df
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
        features, sentiment, train_size=0.7, random_state=random_state)
    log("Split")

    # One entry per fitted model: (title, file tag, estimator, test predictions).
    fitted = []
    for _, title, file_tag, estimator_cls in requested:
        estimator = estimator_cls()
        estimator.fit(x_train, y_train)
        predicted = estimator.predict(x_test)
        log(f"{title} fit")
        fitted.append((title, file_tag, estimator, predicted))

    metrics_df = pd.DataFrame(
        [scores(y_test, predicted) for _, _, _, predicted in fitted],
        columns=["Accuracy", "Precision", "Recall", "F1"],
        index=[title for title, _, _, _ in fitted],
    )

    if save:
        import os
        from joblib import dump

        # Create and write to the SAME directory, built portably.
        out_dir = os.path.join(model_dir, f'{rep}{size}')
        os.makedirs(out_dir, exist_ok=True)

        for _, file_tag, estimator, _ in fitted:
            dump(estimator, os.path.join(out_dir, f'Model_{rep}_{file_tag}_{size}.pkl'))

        # The vocabulary only exists when the data cell ran its in-memory path.
        try:
            vocabulary = doubleVocab[rep]
        except NameError:
            vocabulary = None
            print("doubleVocab is not defined; vocabulary was not saved")
        if vocabulary is not None:
            dump(vocabulary, os.path.join(out_dir, f'Vocab_{size}.pkl'))

        # Run-level reports are written once, not once per model.
        metrics_df.to_html(os.path.join(out_dir, f'Metrics_{rep}_{size}.html'))
        with open(os.path.join(out_dir, f'TimeLog_{rep}_{size}.txt'), 'w') as log_file:
            log_file.write("\n".join(time_log))

    if 'table' in plot:
        display(metrics_df)

    if 'matrix' in plot:
        for title, _, estimator, predicted in fitted:
            # Each matrix uses its own estimator's class order and labels.
            matrix = sklearn.metrics.confusion_matrix(
                y_test, predicted, labels=estimator.classes_)
            disp = sklearn.metrics.ConfusionMatrixDisplay(
                matrix, display_labels=estimator.classes_)
            disp.plot()
            disp.ax_.set_title(title)

In [5]:
# Run fit() for the configured representation(s).
# Matching is case-insensitive for every accepted value, not only 'both'.
requested_rep = representation.lower()
canonical_reps = {'bow': 'BoW', 'tfidf': 'TFIDF'}

if requested_rep == 'both':
    reps_to_fit = ('BoW', 'TFIDF')
elif requested_rep in canonical_reps:
    reps_to_fit = (canonical_reps[requested_rep],)
else:
    raise Exception("Selected representation is not in ['BoW', 'TFIDF', 'Both']")

for current_rep in reps_to_fit:
    fit(current_rep, retrain, models, save, plot)

TFIDF
Started at 05:40:28
Split at 05:40:28
Naive Bayes fit at 05:40:28
Logistic Regression fit at 05:40:28
Support Vector Machine fit at 05:40:29


Unnamed: 0,Accuracy,Precision,Recall,F1
Naive Bayes,0.616667,0.56044,0.744526,0.639498
Logistic Regression,0.633333,0.583851,0.686131,0.630872
Support Vector Machine,0.596667,0.547619,0.671533,0.603279
