In [63]:
import pandas as pd
import time 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score


# Read the raw tweet table and discard its 'textID' column in a single expression.
data = pd.read_csv('Tweets.csv').drop(columns='textID')

In [64]:
# Build two views of the raw table that share one schema: one whose 'text' is the
# highlighted phrase ('selected_text'), and one whose 'text' is the full tweet.
selected_text = data.drop(columns='text').rename(columns={'selected_text': 'text'})
data1 = data.drop(columns='selected_text')

# Integer codes for the three sentiment labels.
sentiment_codes = {'neutral': 0, 'negative': -1, 'positive': 1}

# Stack both views, drop incomplete rows, then encode ONLY the 'sentiment' column.
# A frame-wide replace would also rewrite any 'text' cell that is exactly
# 'neutral' / 'negative' / 'positive' into an integer, corrupting those samples.
# Labels outside sentiment_codes become NaN instead of staying as mixed strings.
# NOTE(review): every tweet now contributes up to two rows (phrase + full text);
# a random row-level train/test split can place both on opposite sides - confirm
# this overlap is acceptable for the reported scores.
dataSet = (
    pd.concat([selected_text, data1], axis=0)
    .dropna()
    .assign(sentiment=lambda frame: frame['sentiment'].map(sentiment_codes))
)

In [65]:
# Keep only the rows whose 'text' value is a real Python string, then renumber
# the surviving rows 0..n-1 so positional and label indexing agree.
has_string_text = dataSet['text'].map(lambda value: isinstance(value, str))
dataSet = dataSet[has_string_text].reset_index(drop=True)

# Target labels, carrying the same fresh 0..n-1 index as dataSet.
sentiment_column = dataSet['sentiment'].reset_index(drop=True)

In [66]:
# Bag-of-words token counts; text is lower-cased before tokenising.
CountVect = CountVectorizer(lowercase=True)
# Seeded so that repeated runs fit the same tree: the classifier permutes
# candidate features at each split, so an unseeded tree can change between runs.
decisionTree = DecisionTreeClassifier(random_state=52)

def TrainAndPredict(vectorizerType, classifierType, x_data, y_data,
                    test_size=0.20, random_state=52, f1_average='micro'):
    """Split the data, fit the classifier, and report hold-out metrics.

    Parameters
    ----------
    vectorizerType : object
        Only interpolated into the printed banner; it is not called here.
    classifierType : estimator exposing ``fit`` and ``predict``
        Fitted in place on the training part of the split.
    x_data, y_data
        Features and labels, passed straight to ``train_test_split``.
    test_size : float, default 0.20
        Fraction of rows held out for evaluation.
    random_state : int, default 52
        Seed for the shuffled split.
    f1_average : str, default 'micro'
        ``average`` argument forwarded to ``f1_score``.
        NOTE(review): the recorded run shows micro-F1 matching accuracy to
        within rounding, so 'macro' or 'weighted' may be more informative -
        confirm which is wanted.

    Returns
    -------
    tuple
        ``(accuracy, f1, confusion_matrix)`` computed on the held-out rows.
    """
    # The timer deliberately spans split + fit + predict, as before.
    start = time.perf_counter()
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=test_size, random_state=random_state, shuffle=True
    )
    classifierType.fit(x_train, y_train)
    predictions = classifierType.predict(x_test)
    stop = time.perf_counter()

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average=f1_average)
    matrix = confusion_matrix(y_test, predictions)
    # A space now separates the vectorizer repr from "Classifier:"; previously the
    # two ran together in the banner (e.g. "CountVectorizer()Classifier:").
    print(f"\033[1;30;46m\tVectorizer: {vectorizerType} Classifier: {classifierType} Time: {stop-start:.4f}\033[0m\n{accuracy}\t{f1}\n{matrix}")

    return (accuracy, f1, matrix)

# Learn the vocabulary from every text row and encode each row as sparse token counts.
dataCount = CountVect.fit_transform(dataSet['text'])
# Stored under its own name: assigning this to `data` would overwrite the raw
# frame read from Tweets.csv, and re-running the reshaping cell afterwards would
# fail because the 'text' / 'selected_text' columns would no longer exist.
featureFrame = pd.DataFrame.sparse.from_spmatrix(dataCount)
# Append the labels last so the final column is the target; both objects carry
# a 0..n-1 index, so the column-wise concat lines rows up one to one.
finalTable = pd.concat([featureFrame, sentiment_column], axis=1)

x_col = finalTable.columns[:-1]
y_col = finalTable.columns[-1]

(acc, f1, matrix) = TrainAndPredict(CountVect, decisionTree, finalTable[x_col], finalTable[y_col])

# Record the score of this run alongside the vectorizer that produced it.
model_parameters = []
bestAccuracy = acc
model_parameters.append((acc, CountVect))

[1;30;46m	Vectorizer: CountVectorizer()Classifier: DecisionTreeClassifier() Time: 75.3982[0m
0.8300582241630277	0.8300582241630278
[[2241  645  165]
 [ 172 4127  171]
 [ 272  443 2756]]


In [67]:
import pickle

# Persist the fitted classifier. The context manager closes the file handle
# even if pickling raises, so the file is not left open or partially flushed.
# NOTE(review): only the tree is saved here; scoring new text also needs the
# fitted CountVect vocabulary - confirm whether it should be persisted as well.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(decisionTree, model_file)