In [27]:
# import necessary libraries
import pandas as pd
import re
import string,time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer

# define utility funtions

def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

def remove_punc(text):
    exclude = string.punctuation
    for char in exclude:
        text = text.replace(char,'')
    return text

def remove_punc1(text):
    exclude = string.punctuation
    return text.translate(str.maketrans('', '', exclude))


def chat_conversion(text):
    chat_words = {
        'AFAIK':'as far as i know',
        'AFK':'away from keyboard',
        'ASAP':'as soon as possible'
    }
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_stop_words(text):
    sentence = word_tokenize(text)
    words = [ word for word in sentence if not word in stopwords.word('english')]
    return ' '.join(words)

def remove_stop_words(text):
    sentence = word_tokenize(text)
    words = [ word for word in sentence if not word in stopwords.words('english')]
    return ' '.join(words)

def lemmatize_words(text):
    lem = WordNetLemmatizer()
    sentence = word_tokenize(text)
    words = [ lem.lemmatize(word) for word in sentence]
    return ' '.join(words)
    
def stem_words(text):
    stemmer = PorterStemmer()
    sentence = word_tokenize(text)
    words = [ stemmer.stem(word) for word in sentence]
    return ' '.join(words)

In [2]:
# load data set
df = pd.read_csv("data/Twitter Sentiments.csv");
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
df['tweet'][0]

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [4]:
df.shape

(31962, 3)

# Steps for data preprocessing

### 1.lower case
### 2.remove_html_tags
### 3.remove_url
### 4.punctuation handling
### 5.chat_conversion handle
### 6.incorrect_text handling
### 7.stopwords
### 8.remove_emoji handle
### 9.Tokenization
### 10.Stemming 
### 11.Lemmatizing

In [5]:
df_copy = df.copy()
from nltk.tokenize import sent_tokenize

In [6]:
# lowering
df_copy['tweet'] = df_copy['tweet'].str.lower()
df_copy.tweet[0]

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [7]:
#remove html tags
df_copy['tweet'] = df_copy['tweet'].apply(remove_html_tags)

In [8]:
# remove URLs
df_copy['tweet'] = df_copy['tweet'].apply(remove_url)

In [9]:
# remove punctuation
df_copy['tweet'] = df_copy['tweet'].apply(remove_punc1)

In [10]:
df_copy.tweet[3]

'model   i love u take with u all the time in urð\x9f\x93± ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  '

In [11]:
# chat conversion
df_copy.tweet[4018]

'user hey when are u guys gonna release the gtx 1080 strix in australia i needa buy it asap cant wait no longer hahahahaaa  '

In [12]:
df_copy['tweet'] = df_copy['tweet'].apply(chat_conversion)
df_copy.tweet[4018]

'user hey when are u guys gonna release the gtx 1080 strix in australia i needa buy it as soon as possible cant wait no longer hahahahaaa'

In [15]:
#incorrect text handling
from textblob import TextBlob


In [24]:
#df_copy['tweet'] = df_copy['tweet'].apply(lambda text: TextBlob(text).correct().string)

In [29]:
# remove stopwords
df_copy['tweet'] = df_copy['tweet'].apply(remove_stop_words)
df_copy.tweet[4018]

'user hey u guys gon na release gtx 1080 strix australia needa buy soon possible cant wait longer hahahahaaa'

In [39]:
# remove emojis
df_copy['tweet'] = df_copy['tweet'].apply(remove_emoji)
df_copy.tweet[3]

'model love u take u time urð\x9f\x93± ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91 ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦'

In [53]:
#remove special characters
reg = r'[^a-zA-Z]'
df_copy['tweet'] = df_copy['tweet'].apply(lambda text : re.sub(reg,' ',text))
df_copy.tweet[3]

'model love u take u time ur                                 '

In [54]:
#lemmatizing the words
df_copy['tweet'] = df_copy['tweet'].apply(lemmatize_words)

In [55]:
df_copy['tweet']

0        user father dysfunctional selfish drag kid dys...
1        user user thanks lyft credit cant use cause do...
2                                           bihday majesty
3                              model love u take u time ur
4                            factsguide society motivation
                               ...                        
31957                                   ate user isz youuu
31958    see nina turner airwave trying wrap mantle gen...
31959       listening sad song monday morning otw work sad
31960    user sikh temple vandalised calgary wso condem...
31961                                    thank user follow
Name: tweet, Length: 31962, dtype: object

# Bag Of Words

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [69]:
bow = cv.fit_transform(df_copy['tweet'])

In [70]:
len(cv.vocabulary_)

37098

In [71]:
bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [72]:
print(bow[0].toarray())

[[0 0 0 ... 0 0 0]]
