In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [3]:
# Read the raw tweet export, decoding it as latin1.
tweets_path = 'data/tweets.csv'
raw_data = pd.read_csv(tweets_path, encoding='latin1')

In [4]:
# Give the three columns short names (assigned by position).
short_column_names = ['text', 'product', 'target']
raw_data.columns = short_column_names

In [5]:
# Integer encoding of the sentiment labels:
#   0 = negative, 1 = no emotion / undecided, 2 = positive.
# Any label missing from this mapping becomes NaN after .map().
target_values = {
    'Negative emotion': 0,
    'No emotion toward brand or product': 1,
    "I can't tell": 1,
    'Positive emotion': 2,
}

encoded_target = raw_data['target'].map(target_values)
raw_data['target'] = encoded_target

In [6]:
# Drop rows without tweet text, keep the first occurrence of each distinct
# tweet, and renumber the rows from 0.
raw_data = (
    raw_data
    .dropna(subset=['text'])
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

In [7]:
from sklearn.model_selection import train_test_split

# Split raw tweet text and encoded labels with a fixed seed.
# NOTE(review): X_train / X_test / y_train / y_test are not referenced by any
# later cell visible here - confirm whether this split is still needed.
tweet_text = raw_data['text']
tweet_label = raw_data['target']
X_train, X_test, y_train, y_test = train_test_split(tweet_text, tweet_label, random_state=10)



In [8]:
# Work on an independent copy so the cleaning cells below leave raw_data as loaded.
data = raw_data.copy(deep=True)

In [15]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` deleted.
    """
    # Substitute with the pattern itself rather than with each matched string:
    # matched text may contain regex metacharacters, and a short match that is
    # a prefix of a longer one (e.g. '@a' and '@ab') would otherwise strip
    # part of the longer match and leave a fragment behind.
    return re.sub(pattern, '', input_txt)

In [10]:
# Strip @handles from every tweet. The pattern is a raw string ('\w' is not a
# valid escape in a normal string literal) and is applied in a single regex
# pass per tweet through pandas' string accessor instead of np.vectorize.
data['clean'] = data['text'].str.replace(r'@\w*', '', regex=True)

In [19]:
a = data['text'][100]

In [20]:
a

'Headline: &quot;#iPad 2 is the Must-Have Gadget at #SXSW&quot; Hmm... I could have seen that one coming! {link} #gadget'

In [21]:
# Scratch check of handle removal on a single tweet; raw string so '\w' is a
# regex class rather than an invalid string escape.
np.vectorize(remove_pattern)(a, r'@\w*')

array('Headline: &quot;#iPad 2 is the Must-Have Gadget at #SXSW&quot; Hmm... I could have seen that one coming! {link} #gadget',
      dtype='<U119')

In [22]:
# Same check with re.sub directly; raw string so '\w' is a regex class rather
# than an invalid string escape.
re.sub(r'@\w*', '', a)

'Headline: &quot;#iPad 2 is the Must-Have Gadget at #SXSW&quot; Hmm... I could have seen that one coming! {link} #gadget'

In [16]:
# Scratch check of handle removal; raw string so '\w' is a regex class rather
# than an invalid string escape.
np.vectorize(remove_pattern)(a, r'@\w*')

array('. I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.',
      dtype='<U118')

In [32]:
# Keep only ASCII letters and '#'. Every other character becomes a space
# (not an empty string) so word boundaries survive for the whitespace split
# in the following cells. regex=True is explicit because pandas >= 2.0 treats
# the pattern as a literal string by default.
data['clean'] = data['clean'].str.replace(r'[^a-zA-Z#]', ' ', regex=True)

KeyError: 'clean'

In [25]:
b = data['text'] 

In [36]:
# Replace everything except ASCII letters and '#' with a space.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default.
c = b.str.replace(r'[^a-zA-Z#]', ' ', regex=True)

In [37]:
b[0]

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [38]:
c[0]

'  wesley   I have a  G iPhone  After   hrs tweeting at #RISE Austin  it was dead   I need to upgrade  Plugin stations at #SXSW '

In [29]:
b == c

0       False
1       False
2       False
3       False
4       False
        ...  
9060    False
9061    False
9062    False
9063    False
9064    False
Name: text, Length: 9065, dtype: bool

In [47]:
' I hate you'.replace('at', '')

' I he you'

In [40]:
# Drop tokens of three characters or fewer and re-join with single spaces.
d = c.map(lambda tweet: ' '.join(token for token in tweet.split() if len(token) > 3))

In [44]:
c[0]

'  wesley   I have a  G iPhone  After   hrs tweeting at #RISE Austin  it was dead   I need to upgrade  Plugin stations at #SXSW '

In [45]:
d[0]

'wesley have iPhone After tweeting #RISE Austin dead need upgrade Plugin stations #SXSW'

In [24]:
a.replace('[^a-zA-Z#]', ' ')

'Headline: &quot;#iPad 2 is the Must-Have Gadget at #SXSW&quot; Hmm... I could have seen that one coming! {link} #gadget'

In [12]:
# Drop tokens of three characters or fewer and re-join with single spaces.
data['clean'] = data['clean'].map(
    lambda tweet: ' '.join(token for token in tweet.split() if len(token) > 3)
)

In [13]:
data['clean'].head()

0    have iPhone After tweeting #RISE Austin dead n...
1    Know about Awesome iPad iPhone that likely app...
2     wait #iPad also They should sale them down #SXSW
3    hope this year festival crashy this year iPhon...
4    great stuff #SXSW Marissa Mayer Google Reilly ...
Name: clean, dtype: object

In [14]:
# Whitespace-tokenize each cleaned tweet into a list of words.
tokenize_data = data['clean'].map(str.split)
tokenize_data.head()

0    [have, iPhone, After, tweeting, #RISE, Austin,...
1    [Know, about, Awesome, iPad, iPhone, that, lik...
2    [wait, #iPad, also, They, should, sale, them, ...
3    [hope, this, year, festival, crashy, this, yea...
4    [great, stuff, #SXSW, Marissa, Mayer, Google, ...
Name: clean, dtype: object

In [15]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Lower-case every token and reduce it to its WordNet lemma."""
    return [lemmatizer.lemmatize(token.lower()) for token in tokens]


tokenize_data = tokenize_data.apply(_lemmatize_tokens)
tokenize_data.head()

0    [have, iphone, after, tweeting, #rise, austin,...
1    [know, about, awesome, ipad, iphone, that, lik...
2    [wait, #ipad, also, they, should, sale, them, ...
3    [hope, this, year, festival, crashy, this, yea...
4    [great, stuff, #sxsw, marissa, mayer, google, ...
Name: clean, dtype: object

In [16]:
import nltk

# NLTK's English stop words plus event, brand and markup tokens that appear
# throughout this dataset.
dataset_stopwords = [
    'sxsw', 'sxswi', 'link', 'quot', 'rt', 'apple', 'google', 'iphone', 'ipad',
    '#sxsw', '#sxswi', '#apple', '#ipad', '#iphone', '#google', 'austin',
]
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(dataset_stopwords)


def cleaning(text):
    """Return the tokens of ``text`` that are not in ``stopwords``, in order."""
    return list(filter(lambda word: word not in stopwords, text))

In [17]:
tokenize_data = tokenize_data.apply(cleaning)

In [18]:
tokenize_data

0       [tweeting, #rise, dead, need, upgrade, plugin,...
1       [know, awesome, likely, appreciate, design, al...
2                                      [wait, also, sale]
3                    [hope, year, festival, crashy, year]
4       [great, stuff, marissa, mayer, reilly, tech, b...
                              ...                        
9060                                         [everywhere]
9061    [wave, buzz, interrupt, regularly, scheduled, ...
9062    [zeiger, physician, never, reported, potential...
9063    [verizon, customer, complained, time, fell, ba...
9064                                 [test, check, offer]
Name: clean, Length: 9065, dtype: object

In [19]:
# Re-join each token list into one space-separated string. The result is built
# as a new Series instead of overwriting tokenize_data element by element, so
# re-running this cell does not join the characters of already-joined strings.
data['clean'] = tokenize_data.apply(' '.join)

In [20]:
data.isnull().sum()

text          0
product    5785
target        0
clean         0
dtype: int64

In [21]:
data.to_csv('cleaned_tweets.csv', index=False)

In [22]:
# Reload the exported file. Tweets whose cleaned text is an empty string come
# back as NaN in 'clean' here (see the null rows inspected in the next cells).
cleaned_path = 'cleaned_tweets.csv'
clean_data = pd.read_csv(cleaned_path)

In [29]:
null_index = clean_data.loc[clean_data['clean'].isnull()].index

In [30]:
data.iloc[null_index]

Unnamed: 0,text,product,target,clean
58,@mention @mention &amp; @mention having fun ...,,1,
242,How I got an iPad 2 during #SXSW! {link} cc @...,,1,
1697,@mention #SXSW #Apple #iPad2 {link},,1,
1873,Win an iPad 2 at SXSW via @mention ! #sxsw {link},iPad,2,
1877,Is the iPad 2 out yet in USA? Or will it be du...,,1,
1882,Win an iPad at SXSW via @mention #sxsw {link},iPad,2,
2249,Just got BBQ on my iPhone #sxsw,,1,
2525,@mention Did you do this in Austin at @mention...,,1,
3628,@mention if you have an iPhone get the #SXSW G...,iPad or iPhone App,1,
3951,@mention Just got my #SXSW Go iphone App! {link},iPad or iPhone App,2,


In [None]:
# Concatenate every cleaned tweet into one space-separated string.
corpus = ' '.join(data['clean'])

In [None]:
from wordcloud import WordCloud

# Word cloud over the full cleaned corpus.
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(corpus)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
# Word cloud for tweets labelled 1 (no emotion / undecided).
is_neutral = data['target'] == 1
neutral_tweets = ' '.join(data.loc[is_neutral, 'clean'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(neutral_tweets)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
# Word cloud for tweets labelled 0 (negative emotion).
is_negative = data['target'] == 0
negative_tweets = ' '.join(data.loc[is_negative, 'clean'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(negative_tweets)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
# Word cloud for tweets labelled 2 (positive emotion).
is_positive = data['target'] == 2
positive_tweets = ' '.join(data.loc[is_positive, 'clean'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(positive_tweets)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
def hashtag_extract(x):
    """Collect the hashtags of each text in ``x``.

    Returns a list with one entry per input text; each entry is the list of
    hashtag names found in that text, in order, without the leading '#'.
    """
    tag_pattern = re.compile(r"#(\w+)")
    return [tag_pattern.findall(tweet) for tweet in x]

In [None]:
# hashtag_extract returns one list of tags per tweet; flatten each result into
# a single list of tags per sentiment class. A nested comprehension does this
# in one pass, whereas sum(list_of_lists, []) copies the growing accumulator
# on every addition.
neutral_ht = [
    tag
    for tags in hashtag_extract(data.loc[data['target']==1, 'clean'])
    for tag in tags
]

negative_ht = [
    tag
    for tags in hashtag_extract(data.loc[data['target']==0, 'clean'])
    for tag in tags
]

positive_ht = [
    tag
    for tags in hashtag_extract(data.loc[data['target']==2, 'clean'])
    for tag in tags
]

In [None]:
# Ten most frequent hashtags among tweets labelled 1.
neutral_a = nltk.FreqDist(neutral_ht)
neutral_tags = list(neutral_a.keys())
neutral_counts = list(neutral_a.values())
neutral_d = pd.DataFrame({'Hashtag': neutral_tags, 'Count': neutral_counts})

neutral_d = neutral_d.nlargest(n=10, columns="Count")
fig, ax = plt.subplots(figsize=(16, 5))
sns.barplot(data=neutral_d, x="Hashtag", y="Count", ax=ax)
ax.set(ylabel='Count')
plt.show()

In [None]:
# Ten most frequent hashtags among tweets labelled 0.
negative_a = nltk.FreqDist(negative_ht)
negative_tags = list(negative_a.keys())
negative_counts = list(negative_a.values())
negative_d = pd.DataFrame({'Hashtag': negative_tags, 'Count': negative_counts})

negative_d = negative_d.nlargest(n=10, columns="Count")
fig, ax = plt.subplots(figsize=(16, 5))
sns.barplot(data=negative_d, x="Hashtag", y="Count", ax=ax)
ax.set(ylabel='Count')
plt.show()

In [None]:
# Ten most frequent hashtags among tweets labelled 2.
positive_a = nltk.FreqDist(positive_ht)
positive_tags = list(positive_a.keys())
positive_counts = list(positive_a.values())
positive_d = pd.DataFrame({'Hashtag': positive_tags, 'Count': positive_counts})

positive_d = positive_d.nlargest(n=10, columns="Count")
fig, ax = plt.subplots(figsize=(16, 5))
sns.barplot(data=positive_d, x="Hashtag", y="Count", ax=ax)
ax.set(ylabel='Count')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words feature matrix built from the cleaned tweets.
bow_options = dict(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
bow_vectorizer = CountVectorizer(**bow_options)
bow = bow_vectorizer.fit_transform(data['clean'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF feature matrix built from the cleaned tweets.
tfidf_options = dict(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
tfidf = tfidf_vectorizer.fit_transform(data['clean'])

In [None]:
bow.shape

In [None]:
bow[:31962,:]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# NOTE(review): `bow` has one row per tweet in `data` (9065 in the run recorded
# above), so the 31962 cut-off puts every row in train_bow and leaves test_bow
# empty - confirm whether a separate test set was intended.
train_bow = bow[:31962,:]
test_bow = bow[31962:,:]

# splitting data into training and validation set
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, data['target'], random_state=42, test_size=0.3)

lreg = LogisticRegression(max_iter=1000, random_state=1015)
lreg.fit(xtrain_bow, ytrain) # training the model

# Per-class probabilities on the validation set (columns follow lreg.classes_).
prediction = lreg.predict_proba(xvalid_bow)

# The target has three classes (0, 1, 2), so pick the most probable class for
# each row. Thresholding a single probability column would only ever produce
# 0/1 labels and could never match class 2.
prediction_int = lreg.classes_[prediction.argmax(axis=1)]

accuracy_score(yvalid, prediction_int)

In [None]:
# NOTE(review): `test` is not defined in any cell visible in this notebook -
# confirm where the frame with an 'id' column is supposed to come from.
test_pred = lreg.predict_proba(test_bow)

# Three-class target: take the most probable class per row. This also avoids
# the np.int alias, which no longer exists in current NumPy releases.
test_pred_int = lreg.classes_[test_pred.argmax(axis=1)]
test['label'] = test_pred_int
submission = test[['id','label']]