# Data Preprocessing

In [0]:
# Mount Google Drive inside the Colab VM so files stored on Drive can be read
# by the cells below.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import nltk

# Fetch the stop-word corpus by id instead of calling nltk.download() with no
# arguments: the bare call opens an interactive "Downloader>" prompt that
# blocks "Restart & Run All" until someone types into it.
# The stop-word list is the only NLTK corpus the visible cells read.
nltk.download('stopwords')

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d



True

In [0]:
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer as stem
from nltk.corpus import stopwords
import re
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

## Read Dataset

In [0]:
# Load the raw tweets. Drive is mounted at the absolute path /content/gdrive,
# so the CSV is addressed absolutely as well; the relative form
# 'gdrive/My Drive/...' only resolved when the working directory was /content.
tweets = pd.read_csv('/content/gdrive/My Drive/Datasets/Tweets.csv')

## Filtering

In [0]:
print("Number of samples before filteration : ", tweets.shape[0])

# Drop tweets that contain the substring "RT" (case-sensitive, literal match)
# or that are shorter than 20 characters.
# Implemented as a boolean mask: Series.iteritems() was removed in pandas 2.0,
# and the vectorized string methods avoid a Python-level loop over every row.
contains_rt = tweets['text'].str.contains("RT", regex=False)
too_short = tweets['text'].str.len() < 20
tweets = tweets[~(contains_rt | too_short)]

print("Number of samples after filteration : ", tweets.shape[0])

Number of samples before filteration :  14640
Number of samples after filteration :  14365


In [0]:
# Separate the model input (raw tweet text) from the target (sentiment label).
text, y = tweets['text'], tweets['airline_sentiment']

## Normalize

In [0]:
stemmer = stem('english')
stop_words = set(stopwords.words('english'))
punct = ['.',',']
# Build the tokenizer once; the original one-liner constructed a new
# TweetTokenizer for every tweet. strip_handles=True asks the tokenizer to
# remove @user handles.
tokenizer = TweetTokenizer(strip_handles=True)


def normalize_tweet(tweet):
    """Return a tweet as a space-joined string of lower-cased, stemmed tokens.

    Tokens that start with '.' or ',' and tokens that are English stop words
    are dropped. For the remaining tokens, anything matching ``http\\S+`` is
    removed before stemming, so a token that was only a URL contributes an
    empty string to the joined result.
    """
    kept = []
    for word in tokenizer.tokenize(tweet):
        if word[0] in punct or word.lower() in stop_words:
            continue
        kept.append(stemmer.stem(re.sub(r'http\S+', '', word.lower())))
    return ' '.join(kept)


# Always start from the raw column so that re-running this cell does not
# normalize text that has already been normalized.
text = tweets['text'].apply(normalize_tweet)

# Classification

In [0]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Splitting data into training and testing sets

In [0]:
# Collects one F1 score per classifier. The classifier cells below append to
# this list in order, so re-run this cell before re-running any of them.
scores = []

# Hold out 20% of the tweets for evaluation (fixed seed for repeatability).
train_text, test_text, y_train, y_test = train_test_split(
    text, y, test_size=0.2, random_state=0)

# Learn the TF-IDF vocabulary and weights from the training tweets only, then
# apply the same fitted transform to the held-out tweets.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

## Multinomial Naive Bayes Classifier

In [0]:
# Grid-search the smoothing parameter alpha over 50 log-spaced values
# between 1e-2 and 1e2.
alpha_grid = {'alpha': np.logspace(-2., 2., 50)}
clf = GridSearchCV(MultinomialNB(), param_grid=alpha_grid)
clf.fit(X_train, y_train)

# Score the fitted search object on the held-out split.
y_pred = clf.predict(X_test)
nb_f1 = f1_score(y_test, y_pred, average="micro")
scores.append(nb_f1)

print(clf.best_estimator_)

MultinomialNB(alpha=0.1151395399326447, class_prior=None, fit_prior=True)


## K Nearest Neighbors Classifier

In [0]:
# Grid-search the neighbourhood size for k-nearest-neighbours.
neighbor_grid = {'n_neighbors': [15, 20, 30, 35, 40]}
neigh = GridSearchCV(KNeighborsClassifier(), param_grid=neighbor_grid)
neigh.fit(X_train, y_train)

# Score the fitted search object on the held-out split.
y_pred = neigh.predict(X_test)
knn_f1 = f1_score(y_test, y_pred, average="micro")
scores.append(knn_f1)

print(neigh.best_estimator_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=40, p=2,
           weights='uniform')


## Random Forest Classifier

In [0]:
# Fixed-configuration random forest (no hyper-parameter search here).
forest_params = dict(n_estimators=100, max_depth=400, random_state=0)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

# Score on the held-out split.
y_pred = clf.predict(X_test)
forest_f1 = f1_score(y_test, y_pred, average="micro")
scores.append(forest_f1)

## Scores

In [0]:
# Report the scores in the order they were appended: Naive Bayes, k-NN,
# random forest. Indexing by position keeps the IndexError if one is missing.
score_labels = (
    "Multinomial Naive Bayes Classifier F1 Score =",
    "K Nearest Neighbors Classifier F1 Score =",
    "Random Forest Classifier F1 Score =",
)
for position, label in enumerate(score_labels):
    print(label, scores[position])

Multinomial Naive Bayes Classifier F1 Score = 0.7473024712843718
K Nearest Neighbors Classifier F1 Score = 0.7229376957883745
Random Forest Classifier F1 Score = 0.7741037243299687
