# Sentiment Analysis of Crude Oil Investor Tweets 
<img src="image/oil.jpg" width=300 height=300 /> 
<br/>
<center>Source:
    https://unsplash.com/photos/GrmwVnVSSdU 
<br/>
<p style='text-align: left;'> This notebook will dicuss how to perform a sentiment analysis and create an exploratory data visualization, using dataset from https://www.kaggle.com/yassinehamdaoui1/sentiment-analysis-in-energy-crude-oil </p>

## Data conditioning

In [1]:
import pandas as pd

df = pd.read_csv('tweets.csv')
df.dropna(axis=1,inplace=True)

# drop unused column
df.drop(columns=['created', 'statusSource', 'favorited', 'retweeted'], inplace=True)
print(df.dtypes)
print('\n')
print(df.head())

text             object
favoriteCount     int64
truncated          bool
id                int64
screenName       object
retweetCount      int64
isRetweet          bool
dtype: object


                                                text  favoriteCount  \
0  With domestic output exceeding regional demand...              0   
1  For Friday, here is US Crude Oil front 12 mont...              0   
2  RT @aeberman12: OPEC "pumped 29.6 mmbpd last m...              0   
3  RT @aeberman12: "Ships vanishing from tracking...              0   
4  OPEC "pumped 29.6 mmbpd last month, down 170 k...              4   

   truncated                   id   screenName  retweetCount  isRetweet  
0       True  1147125982907813888   ArgusMedia             0      False  
1       True  1147125489103974400  macrohedged             0      False  
2      False  1147122950753181697       El_Spy             1       True  
3      False  1147122646003437568  SuperiorMar             1       True  
4       True  11471

## HTML parser

In [2]:
from bs4 import BeautifulSoup
temp=[]
for i in df['text']:
    soup = BeautifulSoup(i, "html.parser")
    parsed_text = soup.get_text()
    temp.append(parsed_text)

df['text'] = pd.DataFrame(temp)
print(df['text'])

0      With domestic output exceeding regional demand...
1      For Friday, here is US Crude Oil front 12 mont...
2      RT @aeberman12: OPEC "pumped 29.6 mmbpd last m...
3      RT @aeberman12: "Ships vanishing from tracking...
4      OPEC "pumped 29.6 mmbpd last month, down 170 k...
                             ...                        
995    RT @FollowCII: CII suggests reducing dependenc...
996    RT @AlexisAshi: #Oil dips ⬇️ on demand worries...
997    RT @Fx_Junkies: #UnionJackOil From Malcy's blo...
998    #Index of 8 Core #Industries\n#steel\n#Coal\n#...
999    #CrudeOil \nOn higher side Will sell at 4106,4...
Name: text, Length: 1000, dtype: object


## Remove URL

In [3]:
import re
temp=[]
for i in df['text']:
    strip_url = re.sub(r'http\S+', '', i)
    # \S+ means to not select character starts with http and end with any string
    # replace with none (''), in i (text)
    # \S+ means %s in token
    temp.append(strip_url)

df['text'] = pd.DataFrame(temp) 
print(df['text'])

0      With domestic output exceeding regional demand...
1      For Friday, here is US Crude Oil front 12 mont...
2      RT @aeberman12: OPEC "pumped 29.6 mmbpd last m...
3      RT @aeberman12: "Ships vanishing from tracking...
4      OPEC "pumped 29.6 mmbpd last month, down 170 k...
                             ...                        
995    RT @FollowCII: CII suggests reducing dependenc...
996    RT @AlexisAshi: #Oil dips ⬇️ on demand worries...
997    RT @Fx_Junkies: #UnionJackOil From Malcy's blo...
998    #Index of 8 Core #Industries\n#steel\n#Coal\n#...
999    #CrudeOil \nOn higher side Will sell at 4106,4...
Name: text, Length: 1000, dtype: object


## Remove punctuation

In [4]:
from string import punctuation
temp=[]
for i in df['text']:
    i = i.replace(punctuation, '')
    temp.append(i)

df['text'] = pd.DataFrame(temp)
print(df['text'])

0      With domestic output exceeding regional demand...
1      For Friday, here is US Crude Oil front 12 mont...
2      RT @aeberman12: OPEC "pumped 29.6 mmbpd last m...
3      RT @aeberman12: "Ships vanishing from tracking...
4      OPEC "pumped 29.6 mmbpd last month, down 170 k...
                             ...                        
995    RT @FollowCII: CII suggests reducing dependenc...
996    RT @AlexisAshi: #Oil dips ⬇️ on demand worries...
997    RT @Fx_Junkies: #UnionJackOil From Malcy's blo...
998    #Index of 8 Core #Industries\n#steel\n#Coal\n#...
999    #CrudeOil \nOn higher side Will sell at 4106,4...
Name: text, Length: 1000, dtype: object


## Remove any characters with "@", "#", and RT

In [5]:

temp = []
for i in df['text']:
    i = re.sub(r'@\S+', '', i)
    i = re.sub(r'#\S+', '', i)
    i = i.replace('RT', '')
    temp.append(i)

df['text'] = pd.DataFrame(temp)
print(df['text'])

0      With domestic output exceeding regional demand...
1      For Friday, here is US Crude Oil front 12 mont...
2        OPEC "pumped 29.6 mmbpd last month, down 170...
3        "Ships vanishing from tracking screens, clan...
4      OPEC "pumped 29.6 mmbpd last month, down 170 k...
                             ...                        
995      CII suggests reducing dependence on  imports...
996       dips ⬇️ on demand worries despite  cut exte...
997       From Malcy's blog today: "My view of  has b...
998                of 8 Core \n\n\n\n\n\n\n and\n\nReg… 
999     \nOn higher side Will sell at 4106,4128 SL 41...
Name: text, Length: 1000, dtype: object


## Remove characters with number only

In [6]:
temp = []
for i in df['text']:
    i = re.sub(" \d+", " ", i)
    temp.append(i)

df['text'] = pd.DataFrame(temp)
print(df['text'])

0      With domestic output exceeding regional demand...
1      For Friday, here is US Crude Oil front  months...
2        OPEC "pumped .6 mmbpd last month, down  kb/d...
3        "Ships vanishing from tracking screens, clan...
4      OPEC "pumped .6 mmbpd last month, down  kb/d f...
                             ...                        
995      CII suggests reducing dependence on  imports...
996       dips ⬇️ on demand worries despite  cut exte...
997       From Malcy's blog today: "My view of  has b...
998                 of  Core \n\n\n\n\n\n\n and\n\nReg… 
999      \nOn higher side Will sell at ,4128 SL  Target 
Name: text, Length: 1000, dtype: object


## Language checking

In [7]:
import language_tool_python
tool = language_tool_python.LanguageTool('en-US') 
temp = []
for i in df['text']:
    i = tool.correct(i)
    temp.append(i)

df['text'] = pd.DataFrame(temp)
print(df['text'])

0      With domestic output exceeding regional demand...
1      For Friday, here is US Crude Oil front months ...
2       OPEC “pumped .6 MMP last month, down KB/d fro...
3       “Ships vanishing from tracking screens, cland...
4      OPEC “pumped .6 MMP last month, down KB/d from...
                             ...                        
995     CII suggests reducing dependence on imports b...
996     Dips ☀️ on demand worries despite cut extensi...
997     From Malay's blog today: “My view of has been...
998                  Of Core \n\n\n\n\n\n\n and\n\nReg… 
999       \nOn higher side Will sell at, 4128 SL Target 
Name: text, Length: 1000, dtype: object


## Remove stop words (english)

In [8]:
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
stop = stopwords.words('english')

df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
print(df['text'])

0      With domestic output exceeding regional demand...
1      For Friday, US Crude Oil front months futures ...
2      OPEC “pumped .6 MMP last month, KB/d May’s rev...
3      “Ships vanishing tracking screens, clandestine...
4      OPEC “pumped .6 MMP last month, KB/d May’s rev...
                             ...                        
995    CII suggests reducing dependence imports chang...
996        Dips ☀️ demand worries despite cut extension.
997    From Malay's blog today: “My view clear, massi...
998                                         Of Core Reg…
999          On higher side Will sell at, 4128 SL Target
Name: text, Length: 1000, dtype: object


## Stemming

In [9]:
ps = nltk.porter.PorterStemmer()

temp=[]
for i in df['text']:
    i = ' '.join([ps.stem(word) for word in i.split()])
    temp.append(i)

df['text'] = pd.DataFrame(temp)
print(df['text'])

0      with domest output exceed region demand, US pr...
1      for friday, US crude oil front month futur pri...
2      opec “pump .6 mmp last month, kb/d may’ revis ...
3      “ship vanish track screens, clandestin transfe...
4      opec “pump .6 mmp last month, kb/d may’ revis ...
                             ...                        
995    cii suggest reduc depend import chang energi mix.
996            dip ☀️ demand worri despit cut extension.
997    from malay' blog today: “mi view clear, massiv...
998                                         Of core reg…
999          On higher side will sell at, 4128 SL target
Name: text, Length: 1000, dtype: object


## Sentiment analysis

In [10]:
from textblob import TextBlob
df['polarity'] = df['text'].apply(lambda x: round(TextBlob(x).sentiment.polarity,2))
df['subjectivity'] = df['text'].apply(lambda x: round(TextBlob(x).sentiment.subjectivity,2))
                                  
temp = []
for i in df['polarity']:
    if i < 0:
        s = 'negative'
    elif i >0:
        s = 'positive'
    else:
        s = 'neutral'
    temp.append(s)
df['sentiment'] = pd.DataFrame(temp)
print(df.head())

                                                text  favoriteCount  \
0  with domest output exceed region demand, US pr...              0   
1  for friday, US crude oil front month futur pri...              0   
2  opec “pump .6 mmp last month, kb/d may’ revis ...              0   
3  “ship vanish track screens, clandestin transfe...              0   
4  opec “pump .6 mmp last month, kb/d may’ revis ...              4   

   truncated                   id   screenName  retweetCount  isRetweet  \
0       True  1147125982907813888   ArgusMedia             0      False   
1       True  1147125489103974400  macrohedged             0      False   
2      False  1147122950753181697       El_Spy             1       True   
3      False  1147122646003437568  SuperiorMar             1       True   
4       True  1147122604551155712   aeberman12             1      False   

   polarity  subjectivity sentiment  
0      0.00          0.00   neutral  
1     -0.70          1.00  negative  
2      0

In [21]:
import altair as alt
alt.Chart(df).mark_point().encode(x='favoriteCount',y='retweetCount',color = 'sentiment')

In [18]:
alt.Chart(df).mark_bar().encode(
    y='count()',
    x='screenName',
    color = 'sentiment'
)

In [24]:
brush = alt.selection(type='interval')

scat = alt.Chart().mark_point().encode(
    alt.X('polarity:Q', title='Polarity',scale = alt.Scale(zero=False)),
    alt.Y('subjectivity:Q', title='Subjectivity',scale = alt.Scale(zero=False)),
    alt.Color('sentiment',scale = alt.Scale(scheme='turbo')),
).properties(
    width=420,
    height=400
).add_selection(brush)

#transform bin: specify the dummy parameter (Depth_binned) in y axis
#transform with alt.Bin using the parameter in the dataframe (Depth)
bars = alt.Chart().mark_bar().encode(
    x='polarity:O',
    y='count()',
    color=alt.Color('isRetweet',scale = alt.Scale(scheme='set1'))
).properties(
    height=400
).transform_filter(
    brush
).transform_bin(
    'polarity', field = 'polarity',bin=alt.Bin(maxbins=15))

alt.hconcat(scat, bars, data=df)