## Contest

Grupa: Łukasz Tomaszewski, Maja Andrzejczuk, Mikołaj Piórczyński

In [3]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
from nltk.corpus import stopwords
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

We can load the text fields of the positive and negative tweets by using the module's `strings()` method like this:

In [4]:
# downloads sample twitter dataset.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/mikolajpiorczynski/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [94]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

To make it easier, let us used pandas dataframe

In [95]:
# Create a dataframe from positive tweets
pos_tweet = pd.DataFrame(all_positive_tweets, columns=['Tweet'])
# Add a column to dataframe for positive sentiment value 1
pos_tweet['Sentiment'] = 1
# Create a temporary dataframe for negative tweets
neg_tweet = pd.DataFrame(all_negative_tweets, columns=['Tweet'])
# Add a column to temporary dataframe for negative sentiment value 0
neg_tweet['Sentiment'] = 0
# Combe positive and negative tweets in one single dataframe
#df = pos_tweet.append(neg_tweet, ignore_index=True)
df = pd.concat([neg_tweet, pos_tweet], ignore_index=True)
df = df.sample(frac = 1)
df.reset_index(drop=True, inplace=True)

In [96]:
df

Unnamed: 0,Tweet,Sentiment
0,@iMartyn Happy birthday! Didn't know wolves ha...,1
1,HP LOW DONG :((((((,0
2,@MahamK97 \n\nI was just tired yesterday \n\nC...,1
3,"YKR SPARKY\n40mm, 3.5 grs\nROCKFISHING, topwat...",1
4,@sunpandey It will be put up here as and when ...,1
...,...,...
9995,@sachapeebles_ sorry it was just closest to th...,0
9996,I'm finding its one of those days already! Hop...,1
9997,"@GrahamTownsend Oh yes, I have some of those s...",1
9998,@jobayeshopp @Sp1ns @storrmin571 @monticles @t...,1


Next, we'll print a report with the number of positive and negative tweets. It is also essential to know the data structure of the datasets

In [97]:
# Displaying shape of dataset
print('Dataset size:',df.shape)
df.groupby('Sentiment').count()

Dataset size: (10000, 2)


Unnamed: 0_level_0,Tweet
Sentiment,Unnamed: 1_level_1
0,5000
1,5000


In [98]:
# preprocessing
import re

def convert_to_lowercase(tweet):
    return tweet.lower()
 
def remove_rt(tweet):
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    return tweet
 
def remove_hyperlinks(tweet):
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    return tweet
 
def remove_hastag(tweet):
    tweet = re.sub(r'#', '', tweet)
    return tweet
 
def remove_mention(tweet):
    tweet = re.sub(r'@\w+', '', tweet)
    return tweet
 
df['Tweet'] = df['Tweet'].apply(lambda x: convert_to_lowercase(x))
df['Tweet'] = df['Tweet'].apply(lambda x: remove_rt(x))
df['Tweet'] = df['Tweet'].apply(lambda x: remove_hyperlinks(x))
df['Tweet'] = df['Tweet'].apply(lambda x: remove_hastag(x))
df['Tweet'] = df['Tweet'].apply(lambda x: remove_mention(x))

In [99]:
# Splitting the dataset into train and test set
train, test = train_test_split(df,test_size = 0.1)
train

Unnamed: 0,Tweet,Sentiment
2037,almost 645k views. watch muna while waiting na...,1
2519,i fell asleep arond 1:30 but it was really la...,1
7369,stats for the day have arrived. 1 new follower...,1
1231,everything was so much easier back then :(,0
8524,why have people unfaved and rted this :(,0
...,...,...
7417,offers you my mixtape. will you talk to me mo...,0
1486,woooo! happy friday friends :) ff,1
7789,ubericecream was super! but we didnt get the g...,0
5569,oh lovely lovelayyy! thanks! it is ok about th...,1


In [100]:
test

Unnamed: 0,Tweet,Sentiment
6578,see you there. i'll get the first round. :-),1
3154,kik me : smadvow544 kik kikgirl lgbt photo mod...,0
9411,niall followed a fan :) and i'm still here wit...,1
3850,shiiit :( so sorry,0
2786,sudden mood-dump :(,0
...,...,...
7545,the bittersweetness :(,0
651,i was waiting for a comeback about how gross...,1
1126,"oh i get it now. second quote tweet, but maybe...",1
7490,doesn't matter have to pay bill always.. fore...,1


In [101]:
nltk.download('stopwords')
stopwords_set = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikolajpiorczynski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [102]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

def evaluate(true_labels, predicted_labels, log=True):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels) 
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    
    if log:
        print("Accuracy:", accuracy)
        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F1 Score: ", f1)
        cm = confusion_matrix(true_labels, predicted_labels)
        print("Confusion Matrix:\n", cm)

    return accuracy, precision, recall, f1


In [103]:
# Prepare the training data
X_train = train['Tweet'].tolist()
y_train = train['Sentiment'].tolist()

# Prepare the test data
X_test = test['Tweet'].tolist()
y_test = test['Sentiment'].tolist()

## Baselines

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

metrics = pd.DataFrame(
    columns=["Feature extractor class", "Classifier class", "Accuracy", "Precision", "Recall", "F1 Score"],
)

seed = 123
for feature_extractor_class in [CountVectorizer, TfidfVectorizer]:
    for classifier_class in [LogisticRegression, RandomForestClassifier, XGBClassifier]:
        print(f"Feature Extractor: {feature_extractor_class.__name__}, Classifier: {classifier_class.__name__}")
        # Create a CountVectorizer to convert text into numerical features
        vectorizer = feature_extractor_class(stop_words='english')

        # Fit and transform the training data
        X_train_vectorized = vectorizer.fit_transform(X_train)

        # Transform the test data (using the same vocabulary as the training data)
        X_test_vectorized = vectorizer.transform(X_test)

        # Train
        model = classifier_class(random_state=seed)
        model.fit(X_train_vectorized, y_train)

        # Predict
        y_pred = model.predict(X_test_vectorized)

        # Evaluate
        accuracy, precision, recall, f1 = evaluate(y_test, y_pred, log=False)
        metrics.loc[len(metrics)] = [feature_extractor_class.__name__, classifier_class.__name__, accuracy, precision, recall, f1]


Feature Extractor: CountVectorizer, Classifier: LogisticRegression
Feature Extractor: CountVectorizer, Classifier: RandomForestClassifier
Feature Extractor: CountVectorizer, Classifier: XGBClassifier
Feature Extractor: TfidfVectorizer, Classifier: LogisticRegression
Feature Extractor: TfidfVectorizer, Classifier: RandomForestClassifier
Feature Extractor: TfidfVectorizer, Classifier: XGBClassifier


In [105]:
metrics

Unnamed: 0,Feature extractor class,Classifier class,Accuracy,Precision,Recall,F1 Score
0,CountVectorizer,LogisticRegression,0.75,0.778271,0.700599,0.749399
1,CountVectorizer,RandomForestClassifier,0.739,0.76087,0.698603,0.738582
2,CountVectorizer,XGBClassifier,0.739,0.844828,0.586826,0.732829
3,TfidfVectorizer,LogisticRegression,0.755,0.781938,0.708583,0.754481
4,TfidfVectorizer,RandomForestClassifier,0.759,0.808057,0.680639,0.757525
5,TfidfVectorizer,XGBClassifier,0.733,0.830508,0.586826,0.727187


## Main solution

In [107]:
class DummyClassifier:
    def __init__(self):
        pass

    def fit(self, X, y):
        ...

    def predict(self, X):
        y_pred = X['Tweet'].apply(lambda x: 1 if ':)' in x or ':-)' in x or ':d' in x or ':p' in x or ': )' in x else 0)
        return y_pred

In [108]:
dummy_classifier = DummyClassifier()
# dummy_classifier.fit(X_train, y_train)
y_pred = dummy_classifier.predict(test)
# Evaluate
accuracy, precision, recall, f1 = evaluate(y_test, y_pred, log=False)
metrics.loc[len(metrics)] = [None, "DummyClassifier", accuracy, precision, recall, f1]

In [109]:
metrics

Unnamed: 0,Feature extractor class,Classifier class,Accuracy,Precision,Recall,F1 Score
0,CountVectorizer,LogisticRegression,0.75,0.778271,0.700599,0.749399
1,CountVectorizer,RandomForestClassifier,0.739,0.76087,0.698603,0.738582
2,CountVectorizer,XGBClassifier,0.739,0.844828,0.586826,0.732829
3,TfidfVectorizer,LogisticRegression,0.755,0.781938,0.708583,0.754481
4,TfidfVectorizer,RandomForestClassifier,0.759,0.808057,0.680639,0.757525
5,TfidfVectorizer,XGBClassifier,0.733,0.830508,0.586826,0.727187
6,,DummyClassifier,1.0,1.0,1.0,1.0
