# In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the tweet dataset from disk into a DataFrame
tweets_df = pd.read_csv('twitter.csv')

# Quick inspection: whole frame, column info, summary statistics, tweet text
tweets_df
tweets_df.info()
tweets_df.describe()
tweets_df['tweet']

# Remove the 'id' column; nothing further down this script reads it
tweets_df = tweets_df.drop(columns=['id'])

# Visualise missing values: any null cell shows up as a coloured mark
sns.heatmap(tweets_df.isnull(), yticklabels = False, cbar = False, cmap="Blues")

# Number of positive tweets vs negative tweets.
# Start a new figure so the bars are not drawn on top of the heatmap axes,
# and pass the column as `x=`: seaborn's plotting variables are keyword-only,
# so a positional Series would be taken as the `data` argument.
plt.figure()
sns.countplot(x=tweets_df['label'], label = "Count")

# Record the character length of every tweet
tweets_df['length'] = tweets_df['tweet'].apply(len)
tweets_df
tweets_df.describe()

# View the shortest tweet. The minimum is looked up from the data instead of
# being hard-coded, so this keeps working if the dataset changes.
# idxmin() returns the first row holding the minimum length.
shortest_idx = tweets_df['length'].idxmin()
tweets_df.loc[shortest_idx, 'tweet']

# View a tweet of (approximately) mean length. The mean is truncated to an
# integer, and the first row whose length is closest to it is selected, so a
# row is always found even when no tweet has exactly that length.
mean_length = int(tweets_df['length'].mean())
mean_idx = (tweets_df['length'] - mean_length).abs().idxmin()
tweets_df.loc[mean_idx, 'tweet']

# Plot the histogram of the length column on its own figure, so it does not
# land on whichever axes happened to be current
plt.figure()
tweets_df['length'].plot(bins=100, kind='hist')

# Plot the word cloud
from wordcloud import WordCloud

# Split the rows by class: label 0 is kept as `positive`, label 1 as `negative`
positive = tweets_df.loc[tweets_df['label'] == 0]
positive
negative = tweets_df.loc[tweets_df['label'] == 1]
negative

# Gather every tweet (both classes) and merge them into one space-separated string
sentences = tweets_df['tweet'].tolist()
len(sentences)
sentences_as_one_string = " ".join(sentences)
sentences_as_one_string

# Build the cloud from the combined text and render it on a large figure
all_tweets_cloud = WordCloud().generate(sentences_as_one_string)
plt.figure(figsize=(20, 20))
plt.imshow(all_tweets_cloud)


# creating a pipeline to remove punctuation and stopwords, and perform count vectorization
import string
from nltk.corpus import stopwords

def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded once and reused."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def message_cleaning(message):
    """Strip punctuation and English stop words from a message.

    Args:
        message: The text to clean.

    Returns:
        A list of the remaining whitespace-separated words, in their original
        order and casing. Stop-word matching is done on the lower-cased word.
    """
    # Drop every character that appears in string.punctuation
    punc_removed = ''.join(char for char in message if char not in string.punctuation)
    # Fetch the stop words once per call as a set; previously the stop-word
    # list was rebuilt and linearly scanned for every single word.
    stop_words = _english_stopwords()
    return [word for word in punc_removed.split() if word.lower() not in stop_words]

# Run the cleaning function over every tweet
tweets_df_clean = tweets_df['tweet'].map(message_cleaning)

# Compare one sample row before and after cleaning
sample_row = 5

# cleaned-up version
print(tweets_df_clean[sample_row])

# original version
print(tweets_df['tweet'][sample_row])

from sklearn.feature_extraction.text import CountVectorizer

# Use the cleaning function defined earlier as the analyzer, so each tweet is
# tokenised into its punctuation-free, stop-word-free words before counting
vectorizer = CountVectorizer(analyzer = message_cleaning, dtype = np.uint8)
tweets_countvectorizer = vectorizer.fit_transform(tweets_df['tweet'])

# get_feature_names() was removed from scikit-learn in favour of
# get_feature_names_out(); fall back to the old name only on versions that
# predate the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

# Densify the sparse count matrix once and reuse it for both the printout and
# the feature frame, rather than allocating the full array twice
tweets_dense = tweets_countvectorizer.toarray()
print(tweets_dense)
tweets_countvectorizer.shape
X = pd.DataFrame(tweets_dense)

X
y = tweets_df['label']

# Train a multinomial naive Bayes classifier on a train/test split
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the rows for testing; no random_state is passed here, so
# this script does not pin the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit() returns the estimator itself, so construction and training can be chained
NB_classifier = MultinomialNB().fit(X_train, y_train)

# Evaluate the classifier on the held-out test set
from sklearn.metrics import classification_report, confusion_matrix

# Predict the test set labels and tabulate them against the true labels
y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)

# Draw on a fresh figure so the matrix is not painted onto whichever axes is
# current. fmt='d' prints the integer counts in full; the default annotation
# format ('.2g') would show large counts in scientific notation.
plt.figure()
sns.heatmap(cm, annot=True, fmt='d')
print(classification_report(y_test, y_predict_test))
