In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sn
import re
import nltk 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Location of the tweets CSV; it is downloaded over HTTP each time this cell runs.
data_source_url = "https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
# Parse the remote CSV into the DataFrame that every later cell works from.
airline_tweets = pd.read_csv(filepath_or_buffer=data_source_url)

In [None]:
# Preview the first ten rows to check that the CSV was parsed as expected.
airline_tweets.head(n=10)

In [None]:
# Show the current default figure size (first entry, then second entry)
# before overriding it.
plot_size = plt.rcParams["figure.figsize"]
print(plot_size[0], plot_size[1], sep="\n")

# Switch every subsequent figure to 8 x 6.
plot_size[0], plot_size[1] = 8, 6
plt.rcParams["figure.figsize"] = plot_size

In [None]:
# Share of tweets per airline, labelled with whole-number percentages.
tweets_per_airline = airline_tweets["airline"].value_counts()
tweets_per_airline.plot.pie(autopct='%1.0f%%')

In [None]:
# Overall sentiment split. The colors are applied in value_counts() order
# (most frequent class first); they are not matched to the labels by name.
# NOTE(review): confirm the red/yellow/green mapping suits the actual class order.
sentiment_counts = airline_tweets["airline_sentiment"].value_counts()
sentiment_counts.plot.pie(autopct='%1.0f%%', colors=["red", "yellow", "green"])

In [None]:
# Count tweets for every (airline, sentiment) pair, then pivot the sentiment
# level into columns so that each row is one airline.
tweets_by_airline_and_sentiment = airline_tweets.groupby(['airline', 'airline_sentiment'])
airline_sentiment = tweets_by_airline_and_sentiment['airline_sentiment'].count().unstack()
# One cluster of bars per airline, one bar per sentiment column.
airline_sentiment.plot.bar()

In [None]:
# Bar per sentiment class showing the aggregated airline_sentiment_confidence
# (seaborn's barplot uses its default estimator here).
sn.barplot(data=airline_tweets, x='airline_sentiment', y='airline_sentiment_confidence')

In [None]:
# Pick the model inputs and targets by column position: column 10 supplies the
# text that is cleaned in the next cell, column 1 supplies the class labels.
# NOTE(review): positional indexing silently selects the wrong data if the CSV
# column order ever changes — confirm these positions against the header.
TEXT_COLUMN_POSITION = 10
LABEL_COLUMN_POSITION = 1

features = airline_tweets.iloc[:, TEXT_COLUMN_POSITION].values
labels = airline_tweets.iloc[:, LABEL_COLUMN_POSITION].values

In [None]:
def _clean_tweet(raw_text):
    """Normalise one raw tweet into lowercase, space-separated word tokens.

    Args:
        raw_text: The value to clean. It is passed through ``str()`` first, so
            non-string entries are accepted as well.

    Returns:
        str: The cleaned, lowercased text.
    """
    # Replace every non-word character (punctuation, symbols) with a space.
    cleaned = re.sub(r'\W', ' ', str(raw_text))

    # Drop single letters that stand alone between whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Drop a single letter at the very start of the text. The caret has to be
    # unescaped to anchor at the start of the string: the escaped form `\^`
    # only matches a literal '^', which cannot exist after the \W substitution
    # above, so that variant never removed anything.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse every run of whitespace into one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Remove a standalone leading 'b' (presumably the prefix of a printed
    # bytes literal — confirm against the data).
    cleaned = re.sub(r'^b\s+', '', cleaned)

    return cleaned.lower()


# Clean every tweet once, keeping the original order so that the result stays
# aligned with `labels`.
processed_features = [_clean_tweet(tweet) for tweet in features]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned tweets for testing; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    processed_features,
    labels,
    random_state=0,
    test_size=0.2,
)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Compare two text vectorizers with three classifiers. Each classifier is tuned
# with a grid search on the training split and then scored on the held-out
# test split; a confusion matrix is plotted for every combination.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

import itertools

from sklearn.pipeline import make_pipeline
from tqdm import tqdm

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


def _plot_confusion_matrix(matrix, class_labels, title):
    """Draw a confusion matrix as an annotated heatmap and return it as a frame.

    Args:
        matrix: Square array of counts as returned by ``confusion_matrix``
            (rows are true labels, columns are predicted labels).
        class_labels: Labels in the same order as the rows/columns of ``matrix``.
        title: Text shown above the heatmap.

    Returns:
        pandas.DataFrame: ``matrix`` labelled with ``class_labels`` on both axes.
    """
    # Keyword arguments make the row/column assignment explicit; the positional
    # order of the DataFrame constructor is (data, index, columns).
    cm_frame = pd.DataFrame(matrix, index=class_labels, columns=class_labels)

    fig, ax = plt.subplots(figsize=(6, 6))
    sn.heatmap(cm_frame, annot=True, fmt="d", cmap='Reds', ax=ax)
    # Pin the y-limits to the full grid so that no row is clipped and no blank
    # band is added, whatever the number of classes.
    ax.set_ylim(len(class_labels), 0)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title(title)
    plt.show()
    # Release the figure; this loop creates one per model combination.
    plt.close(fig)
    return cm_frame


transformers = [CountVectorizer(), TfidfVectorizer()]
# The forest is seeded so that repeated runs give the same scores.
classifiers = [LogisticRegression(), MultinomialNB(), RandomForestClassifier(random_state=0)]
hyperparameters = {
    "RandomForestClassifier": {"n_estimators": [1, 10, 15], "max_depth": [1,3,10,20]},
    "LogisticRegression": {"C": [0.1, 0.3, 1]},
    "MultinomialNB": {}
}

# Maps each fitted pipeline to its accuracy on the test split.
results = {}

for transformer in transformers:
    transformer_name = transformer.__class__.__name__
    for classifier in classifiers:
        classifier_name = classifier.__class__.__name__
        gs_classifier = GridSearchCV(classifier, hyperparameters[classifier_name])
        # pipeline.fit() fits the vectorizer itself, so no separate
        # transformer.fit() call is needed beforehand.
        # NOTE(review): the vectorizer is fitted on all of X_train before the
        # grid search splits it into folds, so the cross-validation scores see
        # vocabulary from their own validation folds; the test split is unaffected.
        pipeline = make_pipeline(transformer, gs_classifier)
        pipeline.fit(X_train, y_train)

        best_classifier = gs_classifier.best_estimator_

        print("Best training params: ", gs_classifier.best_params_)
        print("Best training Score: ", gs_classifier.best_score_)
        print("Best estimator: ", best_classifier)

        y_pred_test = pipeline.predict(X_test)
        test_score = accuracy_score(y_test, y_pred_test)
        results[pipeline] = test_score

        # Fix the label order explicitly so that the matrix rows/columns are
        # guaranteed to line up with the axis labels used in the plot.
        class_labels = gs_classifier.classes_
        cm = confusion_matrix(y_test, y_pred_test, labels=class_labels)

        # Show a compact per-candidate table instead of the raw cv_results_ dict.
        cv_summary = pd.DataFrame(gs_classifier.cv_results_)[
            ["params", "mean_test_score", "std_test_score", "rank_test_score"]
        ]
        print("Full results from grid search: ")
        print(cv_summary.to_string(index=False))
        print("Best results from grid search: ", gs_classifier.best_params_)

        print(f"Classifier: {classifier_name} Transformer: {transformer_name}")

        print("F1 score (micro): ", f1_score(y_test, y_pred_test, average='micro'))
        print("F1 score (macro): ", f1_score(y_test, y_pred_test, average='macro'))
        print("Accuracy score: ", test_score)

        cm_df = _plot_confusion_matrix(
            cm,
            class_labels,
            f"Classifier: {classifier_name} Transformer: {transformer_name} Score: {test_score}",
        )
        