In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
# Load the pre-processed dataset. Per the author's note, the text in this file was
# already normalized and had punctuation, numbers and stop words removed.
df = pd.read_excel(io='...xlsx')

In [None]:
# OPTIONAL: keep only the rows whose target is not "Neutral" (the author reports
# that excluding the neutral class increases prediction accuracy).
# The "Unnamed: 0" column is dropped as unnecessary (presumably a saved index column -
# TODO confirm). The drop is chained rather than done in place, and errors="ignore"
# keeps this cell re-runnable: a second run no longer raises KeyError when the
# column has already been removed.
df = df[df.target != "Neutral"].drop(columns=["Unnamed: 0"], errors="ignore")
df

In [None]:
# Number of distinct texts available for each class.
# The column must be selected before calling nunique(): a positional argument to
# GroupBy.nunique() is interpreted as the `dropna` flag, not as a column name, so
# nunique("fixed_text") silently reported unique counts for every column.
df.groupby("target")["fixed_text"].nunique()

In [None]:
# Discard every row that contains a missing value and rebind the cleaned frame to df.
df = df.dropna()

In [None]:
# Print a summary of the dataframe: its columns, their dtypes and the number of
# non-null entries stored in each column.
df.info()

In [None]:
# Rows were removed above, so renumber the index to a clean 0..n-1 range.
# drop=True discards the old index instead of inserting it as an extra "index"
# column; this also keeps the cell re-runnable (without it, repeated runs keep
# adding columns and eventually raise ValueError because "level_0" already exists).
df = df.reset_index(drop=True)

In [None]:
# TF-IDF features over word uni-, bi- and tri-grams.
# fit_transform learns the vocabulary/IDF weights and builds the document-term
# matrix in a single pass, instead of tokenizing the whole corpus twice with
# separate fit() and transform() calls.
# NOTE(review): the vectorizer is fitted on the full corpus, i.e. before the
# train/test split performed inside knn_results, so vocabulary and IDF statistics
# also reflect the test documents.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
tfidf_training_features = tfidf_vectorizer.fit_transform(df['fixed_text'])

In [None]:
# Raw term-count features over word uni-, bi- and tri-grams.
# fit_transform learns the vocabulary and builds the document-term matrix in a
# single pass, instead of tokenizing the whole corpus twice with separate fit()
# and transform() calls.
count_vectorizer = CountVectorizer(ngram_range=(1, 3))
count_training_features = count_vectorizer.fit_transform(df['fixed_text'])

In [None]:
def knn_results(vectorized_matrix, df, test_size=0.2, random_state=0, k_values=None, cv=5):
    """Tune and evaluate a k-nearest-neighbours classifier on vectorized text.

    The data is split into a training and a held-out test part. The number of
    neighbours is selected by cross-validated grid search on the training part
    only, and the selected model is then scored on the test part.

    Parameters
    ----------
    vectorized_matrix : feature matrix with one row per row of ``df``.
    df : dataframe providing the class labels in its ``'target'`` column.
    test_size : fraction of the data held out for testing (default 0.2).
    random_state : seed passed to ``train_test_split`` (default 0).
    k_values : candidate ``n_neighbors`` values; defaults to ``np.arange(1, 10)``.
    cv : number of cross-validation folds used by the grid search (default 5).

    Returns
    -------
    None. The selected k, the macro-averaged F1 score, the classification
    report and the accuracy on the test split are printed.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        vectorized_matrix, df['target'], test_size=test_size, random_state=random_state
    )

    if k_values is None:
        k_values = np.arange(1, 10)
    param_grid = {'n_neighbors': k_values}

    # Search on the training split only. Fitting the search on the full data
    # would let the test rows influence the choice of k and make the metrics
    # reported below optimistically biased.
    knn_gscv = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cv)
    knn_gscv.fit(X_train, Y_train)

    # int() accepts both numpy integers and plain Python ints.
    optimal_k_value = int(knn_gscv.best_params_['n_neighbors'])
    print('The optimal k value is: ' + str(optimal_k_value))

    # With the default refit=True, GridSearchCV has already retrained the best
    # configuration on the whole training split, so no second model is needed.
    classifier = knn_gscv.best_estimator_
    y_pred = classifier.predict(X_test)

    print(f1_score(Y_test, y_pred, average='macro'))
    print(classification_report(Y_test, y_pred))
    print(accuracy_score(Y_test, y_pred))

In [None]:
# Evaluate k-NN on the TF-IDF feature matrix.
knn_results(vectorized_matrix=tfidf_training_features, df=df)

In [None]:
# Evaluate k-NN on the term-frequency (CountVectorizer) feature matrix.
knn_results(vectorized_matrix=count_training_features, df=df)