In [None]:
!pip install fasttext



In [None]:
import html
import re

import fasttext
import pandas as pd

# Unprocessed Data

1. Prepare data according to the format required by fasttext.

In [None]:
# Read the raw tweets, discard the id column, and prepend the "__label__" marker
# to every sentiment so each row is already in "<label> <text>" form.
df = pd.read_csv("tweet_emotions.csv").drop(columns=["tweet_id"])
df = df.assign(sentiment="__label__" + df["sentiment"])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  40000 non-null  object
 1   content    40000 non-null  object
dtypes: object(2)
memory usage: 625.1+ KB


In [None]:
# Display the labelled frame (pandas truncates the rendering of long frames).
df

Unnamed: 0,sentiment,content
0,__label__empty,@tiffanylue i know i was listenin to bad habi...
1,__label__sadness,Layin n bed with a headache ughhhh...waitin o...
2,__label__sadness,Funeral ceremony...gloomy friday...
3,__label__enthusiasm,wants to hang out with friends SOON!
4,__label__neutral,@dannycastillo We want to trade with someone w...
...,...,...
39995,__label__neutral,@JohnLloydTaylor
39996,__label__love,Happy Mothers Day All my love
39997,__label__love,Happy Mother's Day to all the mommies out ther...
39998,__label__happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [None]:
# Write one "<label> <text>" example per line for fastText.
# DataFrame.to_csv(sep=" ") is avoided on purpose: with its default minimal quoting every
# tweet that contains a space is wrapped in double quotes (and inner quotes are doubled),
# which attaches stray '"' characters to the first and last tokens of each example.
# Whitespace runs (including embedded newlines) are collapsed to single spaces so that
# every tweet occupies exactly one line of the output file.
with open("tweet_emotions.unprocessed.txt", "w", encoding="utf-8") as fout:
    for label, text in zip(df["sentiment"], df["content"]):
        fout.write(f"{label} {' '.join(text.split())}\n")

2. Split all tweets into two groups: the first 70% go to the training set and the remaining 30% go to the test set.

In [None]:
# Ordered 70/30 split without shuffling: the first 28,000 lines form the training file and
# the last 12,000 lines form the validation file. The counts are hard-coded to match the
# 40,000 rows reported by df.info().
!head -n 28000 tweet_emotions.unprocessed.txt > tweet_emotions.unprocessed.train
!tail -n 12000 tweet_emotions.unprocessed.txt > tweet_emotions.unprocessed.valid

3. Train a classification model on the training subset and evaluate it on the test subset.

In [None]:
# Fit a supervised fastText classifier on the unprocessed training split,
# leaving every hyperparameter at the library default.
model = fasttext.train_supervised(input="tweet_emotions.unprocessed.train")

In [None]:
# Evaluate on the held-out file; the result is (number of examples, precision, recall)
# for the single top prediction per tweet.
model.test("tweet_emotions.unprocessed.valid")

(12000, 0.31033333333333335, 0.31033333333333335)

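4.  Calculate accuracy (the percentage of correctly predicted emotions) on the test subset

    Every tweet has exactly one label and the model predicts exactly one label per tweet, so Accuracy = Precision = Recall.

    ~0.31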

In [None]:
# Per-label precision / recall / F1 on the held-out file.
# NOTE(review): in the recorded output every recall is nan and each f1score is exactly
# 2 x precision (e.g. 1.5 for __label__hate), which is not a valid F1 value. Treat these
# per-label numbers as unreliable and re-check them against the installed fastText version.
model.test_label("tweet_emotions.unprocessed.valid")

{'__label__anger': {'f1score': nan, 'precision': nan, 'recall': nan},
 '__label__boredom': {'f1score': nan, 'precision': nan, 'recall': nan},
 '__label__empty': {'f1score': nan, 'precision': nan, 'recall': nan},
 '__label__enthusiasm': {'f1score': nan, 'precision': nan, 'recall': nan},
 '__label__fun': {'f1score': nan, 'precision': nan, 'recall': nan},
 '__label__happiness': {'f1score': 0.6557514693534845,
  'precision': 0.32787573467674225,
  'recall': nan},
 '__label__hate': {'f1score': 1.5, 'precision': 0.75, 'recall': nan},
 '__label__love': {'f1score': 0.9502762430939227,
  'precision': 0.47513812154696133,
  'recall': nan},
 '__label__neutral': {'f1score': 0.5975844084545704,
  'precision': 0.2987922042272852,
  'recall': nan},
 '__label__relief': {'f1score': nan, 'precision': nan, 'recall': nan},
 '__label__sadness': {'f1score': 0.22580645161290322,
  'precision': 0.11290322580645161,
  'recall': nan},
 '__label__surprise': {'f1score': nan, 'precision': nan, 'recall': nan},
 '__

5. Provide random examples: text / label / predicted label

In [None]:
# Expected label: happiness
model.predict("had SUCH and AMAZING time last night, McFly were INCREDIBLE")

(('__label__neutral',), array([0.2979269]))

In [None]:
# Expected label: love
model.predict("@mopedronin bullet train from tokyo    the gf and i have been visiting japan since thursday  vacation/sightseeing    gaijin godzilla")

(('__label__neutral',), array([0.29117396]))

In [None]:
# Expected label: fun
model.predict("good morning/midday nation!  FORMULA ONE IN ONE HOUR!")

(('__label__happiness',), array([0.38186407]))

In [None]:
# Expected label: surprise
model.predict("@BuddingGenius you dont say")

(('__label__neutral',), array([0.8196938]))

In [None]:
# Expected label: neutral
model.predict("revision...what fun...still I have thursday to do basically nothing")

(('__label__neutral',), array([0.46377569]))

# Processed Data

1. Prepare and clean data
    (remove what doesn't add value to the sentiment analysis)

In [None]:
def clean_text(text: str) -> str:
    """Normalize one tweet for sentiment classification.

    Steps, in order:
      1. decode HTML entities (e.g. ``&amp;`` -> ``&``);
      2. remove @mentions and whole http(s) links;
      3. replace every remaining punctuation character with a space;
      4. lowercase the result.

    Parameters:
        text: the raw tweet text.

    Returns:
        The cleaned text. It may be empty, e.g. when the tweet was only a mention.
    """
    text = html.unescape(text)
    # \S* consumes the link up to the next whitespace. The previous pattern
    # (https?://[\w]*) stopped at the first "." or "/", leaving fragments such as
    # ".ly/abc" behind, which then survived as junk tokens.
    text = re.sub(r"@\w*|https?://\S*", "", text)
    text = re.sub(r"[^\w\s]", " ", text)
    return text.lower()


# Apply the same cleaning to every tweet; reuse clean_text on any text passed to a model
# trained on this data so training and prediction inputs stay consistent.
df["content"] = df["content"].map(clean_text)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  40000 non-null  object
 1   content    40000 non-null  object
dtypes: object(2)
memory usage: 625.1+ KB


In [None]:
# Display the cleaned frame. Cleaning can leave a row with empty content
# (row 39995 in the recorded output, which originally held only an @mention).
df

Unnamed: 0,sentiment,content
0,__label__empty,i know i was listenin to bad habit earlier a...
1,__label__sadness,layin n bed with a headache ughhhh waitin o...
2,__label__sadness,funeral ceremony gloomy friday
3,__label__enthusiasm,wants to hang out with friends soon
4,__label__neutral,we want to trade with someone who has houston...
...,...,...
39995,__label__neutral,
39996,__label__love,happy mothers day all my love
39997,__label__love,happy mother s day to all the mommies out ther...
39998,__label__happiness,wassup beautiful follow me peep out my ...




In [None]:
# Write one "<label> <text>" example per line for fastText.
# DataFrame.to_csv(sep=" ") is avoided on purpose: with its default minimal quoting every
# tweet that contains a space is wrapped in double quotes, which attaches stray '"'
# characters to the first and last tokens of each example.
# Whitespace runs (including embedded newlines) are collapsed to single spaces so that
# every tweet occupies exactly one line of the output file.
with open("tweet_emotions.preprocessed.txt", "w", encoding="utf-8") as fout:
    for label, text in zip(df["sentiment"], df["content"]):
        fout.write(f"{label} {' '.join(text.split())}\n")

2. Split all tweets into two groups: the first 70% go to the training set and the remaining 30% go to the test set.

In [None]:
# Ordered 70/30 split without shuffling: the first 28,000 lines form the training file and
# the last 12,000 lines form the validation file. The counts are hard-coded to match the
# 40,000 rows reported by df.info().
!head -n 28000 tweet_emotions.preprocessed.txt > tweet_emotions.preprocessed.train
!tail -n 12000 tweet_emotions.preprocessed.txt > tweet_emotions.preprocessed.valid

3. Train a classification model on the training subset and evaluate it on the test subset.

In [None]:
# Fit a supervised fastText classifier on the cleaned training split.
# NOTE(review): epoch=9 is set here, whereas the model trained on the unprocessed data
# used the default epoch count. The accuracy difference between the two runs therefore
# reflects both the text cleaning and the longer training, not the cleaning alone.
model = fasttext.train_supervised("tweet_emotions.preprocessed.train", epoch=9)

In [None]:
# Evaluate on the held-out file; the result is (number of examples, precision, recall)
# for the single top prediction per tweet.
model.test("tweet_emotions.preprocessed.valid")

(12000, 0.3555, 0.3555)

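4.  Calculate accuracy (the percentage of correctly predicted emotions) on the test subset

    Every tweet has exactly one label and the model predicts exactly one label per tweet, so Accuracy = Precision = Recall.

    ~0.36 (0.3555)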

5.  Provide 10 random examples: text / label / predicted label

In [None]:
# Expected label: happiness
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("had SUCH and AMAZING time last night, McFly were INCREDIBLE")

(('__label__neutral',), array([0.39876428]))

In [None]:
# Expected label: love
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("@mopedronin bullet train from tokyo    the gf and i have been visiting japan since thursday  vacation/sightseeing    gaijin godzilla")

(('__label__neutral',), array([0.29186368]))

In [None]:
# Expected label: fun
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("good morning/midday nation!  FORMULA ONE IN ONE HOUR!")

(('__label__happiness',), array([0.81420887]))

In [None]:
# Expected label: surprise
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("@BuddingGenius you dont say")

(('__label__neutral',), array([0.83881623]))

In [None]:
# Expected label: neutral
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("revision...what fun...still I have thursday to do basically nothing")

(('__label__neutral',), array([0.52797586]))

In [None]:
# Expected label: love
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("@JasonBradbury What brings you to our fair Island today?")

(('__label__worry',), array([0.4625048]))

In [None]:
# Expected label: fun
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("@skeetonmytwitts its slimy but its fun")

(('__label__happiness',), array([0.81490767]))

In [None]:
# Expected label: neutral
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("@andyclemmensen whens the sway sway winner announced?")

(('__label__neutral',), array([0.48468545]))

In [None]:
# Expected label: relief
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("flu or allergy??? ... Doesn't matter, just try to squeeze my Sundayyyy")

(('__label__neutral',), array([0.53090721]))

In [None]:
# Expected label: neutral
# NOTE(review): this model was trained on cleaned text (lowercased; punctuation, @mentions
# and links removed), but the string below is passed raw — apply the same cleaning first.
model.predict("Where did I leave my Citeh jersey?")

(('__label__worry',), array([0.67537344]))