In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from textblob import TextBlob

import nltk

In [5]:
data = pd.read_csv("Elon_musk.csv", encoding="Latin-1")
data.head()

Unnamed: 0.1,Unnamed: 0,Text
0,1,@kunalb11 Im an alien
1,2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,3,@joerogan @Spotify Great interview!
3,4,@gtera27 Doge is underestimated
4,5,@teslacn Congratulations Tesla China for amazi...


In [6]:
data["word_count"] = data["Text"].apply(lambda x: len(str(x).split(" ")))
data[["Text", "word_count"]].head()

Unnamed: 0,Text,word_count
0,@kunalb11 Im an alien,4
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,13
2,@joerogan @Spotify Great interview!,4
3,@gtera27 Doge is underestimated,4
4,@teslacn Congratulations Tesla China for amazi...,17


In [7]:
data["char_count"] = data["Text"].str.len()
data[["Text", "char_count"]].head()

Unnamed: 0,Text,char_count
0,@kunalb11 Im an alien,22
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,82
2,@joerogan @Spotify Great interview!,35
3,@gtera27 Doge is underestimated,31
4,@teslacn Congratulations Tesla China for amazi...,104


In [9]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

data["avg_word"] = data["Text"].apply(lambda x: avg_word(x))
data[["Text", "avg_word"]].head()

Unnamed: 0,Text,avg_word
0,@kunalb11 Im an alien,4.75
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,5.384615
2,@joerogan @Spotify Great interview!,8.0
3,@gtera27 Doge is underestimated,7.0
4,@teslacn Congratulations Tesla China for amazi...,5.176471


In [12]:
stop = stopwords.words("english")
data["stopwords"] = data["Text"].apply(lambda x: len([x for x in x.split() if x in stop]))
data[["Text", "stopwords"]].head()

Unnamed: 0,Text,stopwords
0,@kunalb11 Im an alien,1
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,4
2,@joerogan @Spotify Great interview!,0
3,@gtera27 Doge is underestimated,1
4,@teslacn Congratulations Tesla China for amazi...,5


In [15]:
data["hashtags"] = data["Text"].apply(lambda x: len([x for x in x.split() if x.startswith("@")]))
data[["Text", "hashtags"]].head()

Unnamed: 0,Text,hashtags
0,@kunalb11 Im an alien,1
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,1
2,@joerogan @Spotify Great interview!,2
3,@gtera27 Doge is underestimated,1
4,@teslacn Congratulations Tesla China for amazi...,1


In [17]:
data["Numerics"] = data["Text"].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data[["Text", "Numerics"]].head()

Unnamed: 0,Text,Numerics
0,@kunalb11 Im an alien,0
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,0
2,@joerogan @Spotify Great interview!,0
3,@gtera27 Doge is underestimated,0
4,@teslacn Congratulations Tesla China for amazi...,0


# Preprocessing

In [18]:
data["Text"] = data["Text"].apply(lambda x: " ".join(x.lower() for x in x.split()))
data["Text"].head()

0                               @kunalb11 im an alien
1    @id_aa_carmack ray tracing on cyberpunk with h...
2                  @joerogan @spotify great interview!
3                      @gtera27 doge is underestimated
4    @teslacn congratulations tesla china for amazi...
Name: Text, dtype: object

In [19]:
data["Text"] = data["Text"].str.replace('[^\w\s]','')
data["Text"].head()

  data["Text"] = data["Text"].str.replace('[^\w\s]','')


0                                 kunalb11 im an alien
1    id_aa_carmack ray tracing on cyberpunk with hd...
2                     joerogan spotify great interview
3                       gtera27 doge is underestimated
4    teslacn congratulations tesla china for amazin...
Name: Text, dtype: object

In [21]:
stop = stopwords.words("english")
data["Text"] = data["Text"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data["Text"].head()

0                                    kunalb11 im alien
1    id_aa_carmack ray tracing cyberpunk hdr nextle...
2                     joerogan spotify great interview
3                          gtera27 doge underestimated
4    teslacn congratulations tesla china amazing ex...
Name: Text, dtype: object

In [22]:
freq = pd.Series(' '.join(data["Text"]).split()).value_counts()[:10]
freq

spacex            239
amp               218
tesla             166
erdayastronaut    142
rt                127
ppathole          123
flcnhvy           114
yes                86
great              76
teslaownerssv      73
dtype: int64

In [26]:
freq = list(freq.index)
data["Text"] = data["Text"].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
data["Text"].head()

0                                    kunalb11 im alien
1    id_aa_carmack ray tracing cyberpunk hdr nextle...
2                           joerogan spotify interview
3                          gtera27 doge underestimated
4    teslacn congratulations china amazing executio...
Name: Text, dtype: object

In [31]:
freq = pd.Series(" ".join(data["Text"]).split()).value_counts()[-10:]
freq

nyquil                1
musk                  1
negati                1
httpstco6ohta09s5l    1
carousel              1
joeingeneral          1
andrewbogut           1
typical               1
unusual               1
altho                 1
dtype: int64

In [32]:
freq = list(freq.index)

['nyquil',
 'musk',
 'negati',
 'httpstco6ohta09s5l',
 'carousel',
 'joeingeneral',
 'andrewbogut',
 'typical',
 'unusual',
 'altho']

In [33]:
data["Text"][:5].apply(lambda x: str(TextBlob(x).correct()))

0                                    kunalb11 in alien
1    id_aa_carmack ray tracing cyberpunk her nextle...
2                           joerogan specify interview
3                          gtera27 done underestimated
4    teslacn congratulations china amazing executio...
Name: Text, dtype: object

In [34]:
TextBlob(data["Text"][1]).words

WordList(['id_aa_carmack', 'ray', 'tracing', 'cyberpunk', 'hdr', 'nextlevel', 'tried'])

In [36]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
data["Text"][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0                                    kunalb11 im alien
1    id_aa_carmack ray trace cyberpunk hdr nextleve...
2                           joerogan spotifi interview
3                              gtera27 doge underestim
4    teslacn congratul china amaz execut last year ...
Name: Text, dtype: object

In [40]:
from textblob import Word

In [42]:
#Lemmatization

data["Text"] = data["Text"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data["Text"].head()

0                                    kunalb11 im alien
1    id_aa_carmack ray tracing cyberpunk hdr nextle...
2                           joerogan spotify interview
3                          gtera27 doge underestimated
4    teslacn congratulation china amazing execution...
Name: Text, dtype: object

In [44]:
# N grams
#TextBlob(data["Text"][0]).ngrams(2)

#tf1 = (data["Text"][1:2].apply(lambda x: pd.value_counts(x.split(" ")))).sum(axis=0).reset_index()


# Emotion Mining

In [53]:
import codecs 
import re
import copy
import collections
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords

%matplotlib inline

In [52]:
from __future__ import division
import os
from nltk.corpus import twitter_samples

In [56]:
with codecs.open("positive-words.txt", "r", encoding="utf-8") as p:
    pos = p.read()

In [60]:
with codecs.open("negative-words.txt","r") as n:
    neg = n.read()

In [61]:
with codecs.open("stop.txt", "r") as s:
    stop = s.read()

In [68]:
pt = twitter_samples.strings('positive_tweets.json')
nt = twitter_samples.strings('negative_tweets.json')
test = twitter_samples.strings('negative_tweets.json')

In [69]:
ttoken = twitter_samples.tokenized()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True