In [177]:
import numpy as np
import pandas as pd
import torch
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt            # library for visualization
import random                              # pseudo-random number generator
import seaborn as sns
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

In [178]:
df = pd.read_csv('smileannotationsfinal.csv', names=['id', 'text', 'category'])
df.set_index('id', inplace=True)
df.sample(5)

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
614038714075979776,@NationalGallery Hey :) Appreciate if u check ...,not-relevant
610762492277604352,@NationalGallery Thank you for the information...,happy
614840376877318144,QT“@britishmuseum: New on the blog: Richard Pa...,nocode
610579745433063424,@sheilabarbican @Jon__Nash @PlymouthMuseum @Ba...,happy
608964877483433984,Innovation: The Emperor's New Clothes? Have a ...,nocode


In [179]:
df['category'].value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: category, dtype: int64

In [180]:
df.columns

Index(['text', 'category'], dtype='object')

In [181]:
del_rows = ['nocode', 'happy|surprise', 'happy|sad', 'disgust|angry', 'disgust', 'sad|disgust', 'sad|angry', 'sad|disgust|angry']


In [182]:
filt = ~df['category'].isin(del_rows)
df = df.loc[filt, :]

In [183]:
label_dict = {}

for indx, label in enumerate(df['category'].unique()):

    label_dict[label] = indx

label_dict

{'happy': 0, 'not-relevant': 1, 'angry': 2, 'sad': 3, 'surprise': 4}

In [184]:
df['label'] = df['category'].replace(label_dict)
df.sample(5)

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
612994714254467072,@britishmuseum @BL_VisualArts,not-relevant,1
612193346543583232,Setting up at St Peters Cambridge for #CastleH...,happy,0
614728418660339712,Must see @tateliverpool #onlyinLiverpool #Poll...,happy,0
608374279328165889,Fantastic to be in London today speaking @ACEn...,happy,0
612297089335328768,@NationalGallery lots of disappointed people a...,sad,3


In [185]:
def preproccess(text):
    # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)

    # remove hyperlinks
    text = re.sub(r'https?://[^\s\n\r]+', '', text)
    text = re.sub(r'"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"', '', text)
    # remove hashtags
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    
    return text

In [186]:
df.head(5)

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
615253181354901505,"@_TheWhitechapel ""Toward Parliament square at ...",happy,0
610545929637261314,Last RTs - some excellent #AskTheGallery quest...,happy,0
614556909031563264,Re-Defining our beauty - amazing group of crea...,happy,0
614312351043555328,@Hellboy919 @AboutLondon @MadameTussauds @Bate...,happy,0
614497173623996416,Fascinating exhibition coming to @britishmuseu...,happy,0


In [187]:
df['text'] = df['text'].map(preproccess)


In [188]:
df.head(5)

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
612947803975876608,TateStIves acquires Barbara Hepworths Palais d...,happy,0
615104809423781888,tateliverpool DavidSandum sooooo wish I was th...,happy,0
613016852588064768,britishmuseum Saw this for the second time on ...,happy,0
611577978661814273,Spent my afternoon admiring the enchanting obj...,happy,0
614767094345936896,StephenXKing profjoecain britishmuseum I took ...,happy,0
