In [5]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

In [6]:
# Load the raw CSV; the file has no header row, so column names are attached afterwards.
column_names = ['label', 'id', 'date', 'query', 'user', 'tweet']
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin',
)
df.columns = column_names

# Data reduction: only the label and the tweet text are kept.
df = df.drop(columns=['id', 'date', 'query', 'user'])

In [None]:
# Numeric label code -> human-readable sentiment name.
labels_dict = {
    0: 'Negative',
    2: 'Neutral',
    4: 'Positive',
}


def convert_labels(label):
    """Return the sentiment name that ``labels_dict`` stores for ``label``.

    Raises ``KeyError`` when ``label`` is not one of the dictionary's keys.
    """
    return labels_dict[label]


# Replace the numeric codes in the label column with their names.
df.label = df.label.apply(convert_labels)
df

In [None]:
# Number of tweets per sentiment label.
instances = df['label'].value_counts()

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(instances.index, instances.values)
ax.set_title("Data Distribution")

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# English stop words from the NLTK 'stopwords' corpus downloaded above;
# preprocess() drops every token found in this collection.
stop_words = stopwords.words('english')
# English Snowball stemmer; preprocess() only applies it when will_be_stemmed=True.
stemmer = SnowballStemmer('english')

# Everything matching this pattern is replaced by a space before tokenising:
# @mentions, URLs, and any run of non-alphanumeric characters.
# Written as a raw string so `\S` is passed to the regex engine verbatim instead
# of relying on Python's deprecated handling of invalid escape sequences.
# NOTE(review): the `http?:\S` alternative looks like a typo (it makes the "p"
# optional and consumes a single character); `https?:\S+` is tried first and
# already covers http/https URLs. Pattern left unchanged -- confirm before removing.
punctuations_and_dummies = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"



# NOTE(review): the string literal below holds a disabled earlier draft of
# preprocess(). It is never executed and is superseded by the preprocess()
# definition that follows; candidate for deletion.
'''
def preprocess(df, will_be_stemmed=False):
    for index, row in df.iterrows():
        tweet = row.tweet
        tweet = re.sub(punctuations_and_dummies, ' ', str(tweet).lower()).strip()
        tokens = []
        for token in tweet.split():
            if token not in stop_words:
                if will_be_stemmed:
                    tokens.append(stemmer.stem(token))
                else:
                    tokens.append(token)
        df.tweet = " ".join(tokens)


preprocess(df.tweet)
'''


def preprocess(tweet, will_be_stemmed=False):
    """Clean a single tweet and return its kept tokens joined by single spaces.

    The input is converted to ``str`` and lower-cased, every match of
    ``punctuations_and_dummies`` is replaced by a space, and the result is
    split on whitespace. Tokens present in ``stop_words`` are dropped.

    Args:
        tweet: The raw tweet; any object is accepted because it is passed through ``str``.
        will_be_stemmed: When true, each kept token is passed through ``stemmer.stem``.

    Returns:
        The remaining tokens joined with ``" "`` (an empty string if none remain).
    """
    cleaned = re.sub(punctuations_and_dummies, ' ', str(tweet).lower()).strip()
    kept = (token for token in cleaned.split() if token not in stop_words)
    if will_be_stemmed:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)

# Clean every tweet in place (stemming stays off: preprocess's default).
df['tweet'] = df['tweet'].apply(preprocess)

In [10]:
# Remove 0 length tweets (column position 1 is the tweet text).
nonempty_tweets = df.iloc[:, 1].astype(str).str.len().ne(0)
df = df[nonempty_tweets]

In [None]:
# Character length of every cleaned tweet.
tweets_len = df['tweet'].map(len).tolist()

tweet_length_series = pd.Series(tweets_len)
tweet_length_series.hist()
plt.show()
tweet_length_series.describe()

In [12]:
# Concatenate all tweets (no separator) into one string for character counting.
# str.join builds the result in a single pass instead of growing it with
# repeated `+=` inside a Python loop.
all_str = "".join(df.tweet)

In [None]:
from collections import Counter

# Count every character of the concatenated tweets. Counter iterates the string
# directly, so no intermediate list of single characters is materialised.
my_counter = Counter(all_str)

letter_df = (
    pd.DataFrame.from_dict(my_counter, orient='index')
    .reset_index()
    .rename(columns={'index': 'letter', 0: 'frequency'})
)
# Keep only a-z. The explicit .copy() makes the filtered frame independent so the
# column assignment below does not raise SettingWithCopyWarning.
letter_df = letter_df.loc[letter_df['letter'].isin(list('abcdefghijklmnopqrstuvwxyz'))].copy()
letter_df['all_tweets_relative_freq'] = letter_df['frequency'] / letter_df['frequency'].sum()
letter_df = letter_df.sort_values('letter')

# Reference letter counts; the file is read for its 'letter' and 'count' columns.
english = pd.read_csv('letter_frequency_en_US.csv')
english['expected_relative_frequency'] = english['count'] / english['count'].sum()
english = english.drop(['count'], axis=1)

# Join observed and reference frequencies, then scale the reference share to the
# observed total so both columns are on the same count scale.
letter_df = pd.merge(letter_df, english, on='letter')
letter_df['expected'] = np.round(letter_df['expected_relative_frequency'] * letter_df['frequency'].sum(), 0)
letter_df = letter_df.reset_index(drop=True)
letter_df

In [None]:
letter_df.plot(x="letter", y=["all_tweets_relative_freq", "expected_relative_frequency"], kind="barh", figsize=(12,8))

In [None]:
from scipy.stats import chi2_contingency
# Chi-square test of independence on the two-column table of observed and
# expected letter counts.
# NOTE(review): chi2_contingency treats the columns as a contingency table; if the
# intent is a goodness-of-fit test of observed vs. expected counts,
# scipy.stats.chisquare may be the intended call -- confirm.
observed_vs_expected = letter_df[['frequency', 'expected']]
c, p, dof, expected = chi2_contingency(observed_vs_expected)
p

In [None]:
# Correlation between observed and expected letter counts.
letter_df.loc[:, ['frequency', 'expected']].corr()

In [None]:
# Work on a copy so the feature columns do not end up in `df`.
df1 = df.copy()

# Character count of each cleaned tweet.
df1['number_of_characters'] = df1['tweet'].map(len)
df1

In [None]:
# Longest tweet, in characters.
df1['number_of_characters'].max()

In [None]:
# Shortest tweet, in characters.
df1['number_of_characters'].min()

In [None]:
# Mean tweet length, in characters.
df1['number_of_characters'].mean()

In [None]:
# Standard deviation of tweet length, in characters.
df1['number_of_characters'].std()

In [None]:
# Whitespace-separated token count of each cleaned tweet.
df1['number_of_words'] = df1['tweet'].map(lambda tw: len(tw.split()))
df1

In [None]:
# Largest word count in a single tweet.
df1['number_of_words'].max()

In [None]:
# Smallest word count in a single tweet.
df1['number_of_words'].min()

In [None]:
# Mean word count per tweet.
df1['number_of_words'].mean()

In [None]:
# Standard deviation of the word count per tweet.
df1['number_of_words'].std()

In [None]:
import collections
from wordcloud import WordCloud
from nltk import word_tokenize, sent_tokenize
from nltk.util import ngrams

# Pool every tweet into one string and count its whitespace-separated tokens.
all_tweets = ' '.join(df['tweet'].str.lower())
f_words = all_tweets.split()
counted_words = collections.Counter(f_words)

# The 20 most frequent tokens and their counts, most frequent first.
top_words = counted_words.most_common(20)
words = [word for word, _ in top_words]
counts = [count for _, count in top_words]

plt.figure(figsize = (16, 4))
plt.title('Most common words in whole tweets')
# plt.bar(words, counts) draws the words along the x-axis and the counts along
# the y-axis; the axis labels were previously swapped.
plt.xlabel('Words')
plt.ylabel('Count')
plt.bar(words, counts)

In [None]:
# Pool the positive tweets into one string and count its whitespace-separated tokens.
all_tweets = ' '.join(df[df.label == 'Positive'].tweet.str.lower())
f_words = all_tweets.split()
counted_words = collections.Counter(f_words)

# The 20 most frequent tokens and their counts, most frequent first.
top_words = counted_words.most_common(20)
words = [word for word, _ in top_words]
counts = [count for _, count in top_words]

plt.figure(figsize = (16, 4))
plt.title('Most common words in positive tweets')
# plt.bar(words, counts) draws the words along the x-axis and the counts along
# the y-axis; the axis labels were previously swapped.
plt.xlabel('Words')
plt.ylabel('Count')
plt.bar(words, counts)

In [None]:
# Word cloud built from the text of all positive tweets.
positive_text = " ".join(df.loc[df['label'] == 'Positive', 'tweet'])

plt.figure(figsize=(25, 25))
plt.axis('off')
wordcloud_fig = WordCloud(
    max_words=2000,
    width=1600,
    height=800,
    background_color='white',
    min_font_size=10,
).generate(positive_text)
plt.imshow(wordcloud_fig, interpolation='bilinear')

In [None]:
# Pool the negative tweets into one string and count its whitespace-separated tokens.
all_tweets = ' '.join(df[df.label == 'Negative'].tweet.str.lower())
f_words = all_tweets.split()
counted_words = collections.Counter(f_words)

# The 20 most frequent tokens and their counts, most frequent first.
top_words = counted_words.most_common(20)
words = [word for word, _ in top_words]
counts = [count for _, count in top_words]

plt.figure(figsize = (16, 4))
plt.title('Most common words in negative tweets')
# plt.bar(words, counts) draws the words along the x-axis and the counts along
# the y-axis; the axis labels were previously swapped.
plt.xlabel('Words')
plt.ylabel('Count')
plt.bar(words, counts)

In [None]:
from wordcloud import WordCloud

# Word cloud built from the text of all negative tweets.
negative_text = " ".join(df.loc[df['label'] == 'Negative', 'tweet'])

plt.figure(figsize=(25, 25))
plt.axis('off')
wordcloud_fig = WordCloud(
    max_words=2000,
    width=1600,
    height=800,
    background_color='white',
    min_font_size=10,
).generate(negative_text)
plt.imshow(wordcloud_fig, interpolation='bilinear')

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the cleaned frame with a fixed seed.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=7)
print('Training Data', train_data.shape[0], 'Test Data', test_data.shape[0])

train_data.iloc[:10]

In [None]:
# First ten rows of the test split.
test_data.iloc[:10]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


from sklearn.ensemble import RandomForestClassifier


from sklearn.svm import SVC


# Split the full cleaned frame into train and test sets (70% train, 30% test).
# The split is taken from `df` rather than from the current `train_data`:
# re-splitting `train_data` in place made this cell non-idempotent (each run
# shrank the training set further) and left every model cell scoring a
# different subset.
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)

# TF-IDF vocabulary and weights are learned from the training tweets only.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data['tweet'])

# Decision tree classifier; the fixed seed makes the fitted tree reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, train_data['label'])

# Test tweets go through the already-fitted vectorizer (transform, not fit_transform).
X_test = tfidf_vectorizer.transform(test_data['tweet'])
predictions = clf.predict(X_test)

y_test = test_data['label']

# Overall accuracy.
accuracy = accuracy_score(y_test, predictions)

# Recall, precision and F1 with 'Positive' treated as the positive class.
recall_positive = recall_score(y_test, predictions, pos_label='Positive')
precision_positive = precision_score(y_test, predictions, pos_label='Positive')
f1_positive = f1_score(y_test, predictions, pos_label='Positive')

# The same three metrics with 'Negative' treated as the positive class.
recall_negative = recall_score(y_test, predictions, pos_label='Negative')
precision_negative = precision_score(y_test, predictions, pos_label='Negative')
f1_negative = f1_score(y_test, predictions, pos_label='Negative')

# Sensitivity is the same as recall in binary classification.
sensitivity = recall_positive

# Print the evaluation metrics.
print("Accuracy:", accuracy)
print("Recall (Positive class):", recall_positive)
print("Precision (Positive class):", precision_positive)
print("F1-Score (Positive class):", f1_positive)
print("Recall (Negative class):", recall_negative)
print("Precision (Negative class):", precision_negative)
print("F1-Score (Negative class):", f1_negative)
print("Sensitivity:", sensitivity)



In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix with rows/columns ordered Positive, then Negative.
class_order = ["Positive", "Negative"]
conf_matrix = confusion_matrix(test_data['label'], predictions, labels=class_order)

# Heading on one line, matrix on the following lines.
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


from sklearn.ensemble import RandomForestClassifier


from sklearn.svm import SVC


# Split the full cleaned frame into train and test sets (70% train, 30% test).
# The split is taken from `df` rather than from the current `train_data`:
# re-splitting `train_data` in place made this cell non-idempotent (each run
# shrank the training set further) and left every model cell scoring a
# different subset.
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)

# TF-IDF vocabulary and weights are learned from the training tweets only.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data['tweet'])

# Random forest classifier; the fixed seed makes the fitted forest reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, train_data['label'])

# Test tweets go through the already-fitted vectorizer (transform, not fit_transform).
X_test = tfidf_vectorizer.transform(test_data['tweet'])
predictions = clf.predict(X_test)

y_test = test_data['label']

# Overall accuracy.
accuracy = accuracy_score(y_test, predictions)

# Recall, precision and F1 with 'Positive' treated as the positive class.
recall_positive = recall_score(y_test, predictions, pos_label='Positive')
precision_positive = precision_score(y_test, predictions, pos_label='Positive')
f1_positive = f1_score(y_test, predictions, pos_label='Positive')

# The same three metrics with 'Negative' treated as the positive class.
recall_negative = recall_score(y_test, predictions, pos_label='Negative')
precision_negative = precision_score(y_test, predictions, pos_label='Negative')
f1_negative = f1_score(y_test, predictions, pos_label='Negative')

# Sensitivity is the same as recall in binary classification.
sensitivity = recall_positive

# Print the evaluation metrics.
print("Accuracy:", accuracy)
print("Recall (Positive class):", recall_positive)
print("Precision (Positive class):", precision_positive)
print("F1-Score (Positive class):", f1_positive)
print("Recall (Negative class):", recall_negative)
print("Precision (Negative class):", precision_negative)
print("F1-Score (Negative class):", f1_negative)
print("Sensitivity:", sensitivity)



In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes, fitted on the TF-IDF matrix and split that the
# preceding cells left in X_train / X_test / train_data / test_data.
clf = MultinomialNB()
clf.fit(X_train, train_data['label'])

# Predict labels for the test matrix.
predictions = clf.predict(X_test)

y_test = test_data['label']

# Overall accuracy.
accuracy = accuracy_score(y_test, predictions)

# Recall, precision and F1 with 'Positive' treated as the positive class.
recall_positive = recall_score(y_test, predictions, pos_label='Positive')
precision_positive = precision_score(y_test, predictions, pos_label='Positive')
f1_positive = f1_score(y_test, predictions, pos_label='Positive')

# The same three metrics with 'Negative' treated as the positive class.
recall_negative = recall_score(y_test, predictions, pos_label='Negative')
precision_negative = precision_score(y_test, predictions, pos_label='Negative')
f1_negative = f1_score(y_test, predictions, pos_label='Negative')

# Sensitivity is the same as recall in binary classification.
sensitivity = recall_positive

# Print the evaluation metrics, one "<name> <value>" line each.
metric_report = [
    ("Accuracy:", accuracy),
    ("Recall (Positive class):", recall_positive),
    ("Precision (Positive class):", precision_positive),
    ("F1-Score (Positive class):", f1_positive),
    ("Recall (Negative class):", recall_negative),
    ("Precision (Negative class):", precision_negative),
    ("F1-Score (Negative class):", f1_negative),
    ("Sensitivity:", sensitivity),
]
for metric_name, metric_value in metric_report:
    print(metric_name, metric_value)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


from sklearn.ensemble import RandomForestClassifier


from sklearn.svm import SVC


# Split the full cleaned frame into train and test sets (70% train, 30% test).
# The split is taken from `df` rather than from the current `train_data`:
# re-splitting `train_data` in place made this cell non-idempotent (each run
# shrank the training set further) and left every model cell scoring a
# different subset.
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)

# TF-IDF vocabulary and weights are learned from the training tweets only.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data['tweet'])

# Support vector classifier with scikit-learn's default settings.
# NOTE(review): kernel SVC training cost grows faster than linearly with the
# number of samples; on a split of this size it may not finish in reasonable
# time -- confirm, or consider a linear SVM.
clf = SVC()
clf.fit(X_train, train_data['label'])

# Test tweets go through the already-fitted vectorizer (transform, not fit_transform).
X_test = tfidf_vectorizer.transform(test_data['tweet'])
predictions = clf.predict(X_test)

y_test = test_data['label']

# Overall accuracy.
accuracy = accuracy_score(y_test, predictions)

# Recall, precision and F1 with 'Positive' treated as the positive class.
recall_positive = recall_score(y_test, predictions, pos_label='Positive')
precision_positive = precision_score(y_test, predictions, pos_label='Positive')
f1_positive = f1_score(y_test, predictions, pos_label='Positive')

# The same three metrics with 'Negative' treated as the positive class.
recall_negative = recall_score(y_test, predictions, pos_label='Negative')
precision_negative = precision_score(y_test, predictions, pos_label='Negative')
f1_negative = f1_score(y_test, predictions, pos_label='Negative')

# Sensitivity is the same as recall in binary classification.
sensitivity = recall_positive

# Print the evaluation metrics.
print("Accuracy:", accuracy)
print("Recall (Positive class):", recall_positive)
print("Precision (Positive class):", precision_positive)
print("F1-Score (Positive class):", f1_positive)
print("Recall (Negative class):", recall_negative)
print("Precision (Negative class):", precision_negative)
print("F1-Score (Negative class):", f1_negative)
print("Sensitivity:", sensitivity)

In [None]:

import tensorflow as tf
print("TensorFlow version:", tf.__version__)

import tensorflow.keras as keras
print("Keras version:", keras.__version__)

# Tokenizer was used below without being imported in any visible cell, which
# raises NameError on a fresh kernel.
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer index from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data.tweet)
word_index = tokenizer.word_index
# +1 because the embedding matrix built from word_index also needs a row 0,
# which no word in word_index is assigned to.
vocab_size = len(word_index) + 1
print("Vocabulary Size :", vocab_size)


In [None]:
# Path name for models; not referenced by any cell shown here.
# NOTE(review): presumably the directory trained models are saved to -- confirm.
MODELS_PATH = 'models'
# Length of each word vector; it sets the column count of the embedding matrix
# filled from the 300-dimensional GloVe file.
EMBEDDING_DIMENSION = 300

In [None]:
import tensorflow as tf

# Training hyper-parameters (not used within this cell).
BATCH_SIZE = 1024
EPOCHS = 10
LR = 1e-3

# word -> float32 vector, read from the pre-trained GloVe text file. Each line
# is a word followed by its whitespace-separated coefficients.
embeddings_index = {}

# The context manager closes the file even if parsing a line fails.
with open('glove/glove.6B.300d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefficients

print('%s word vectors.' % len(embeddings_index))


# Row i holds the GloVe vector of the word whose tokenizer index is i; words
# missing from GloVe keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIMENSION))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Embedding layer initialised with the GloVe matrix; trainable=False keeps the
# pre-trained vectors fixed during training.
# NOTE(review): input_length=30 is a hard-coded sequence length -- confirm it
# matches the padding length used when the tweets are converted to sequences.
embedding_layer = tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIMENSION, weights=[embedding_matrix], input_length=30, trainable=False)