In [29]:
# Import necessary libraries

import pandas as pd
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import wordnet
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Read in and preview 'tesla_tweets.csv'

tweets = pd.read_csv('tesla_tweets.csv', engine='python')
tweets.head()

Unnamed: 0,id,tweet,date_time
0,947702253313495045,Tesla $TSLA Rating Increased to Hold at ValuEn...,2018-01-01 00:33:19
1,947702742264569857,Tesla $TSLA Rating Increased to Hold at ValuEn...,2018-01-01 00:35:15
2,947709627394748416,Horseman Capital Management LTD Increases Posi...,2018-01-01 01:02:37
3,947712900377976832,Tesla INC (TSLA) Holding Lifted by Horseman Ca...,2018-01-01 01:15:37
4,947732179253170177,Insider Selling: Tesla Inc (NASDAQ:TSLA) VP Se...,2018-01-01 02:32:14


In [3]:
# Examine datatypes inside tweets

tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213213 entries, 0 to 213212
Data columns (total 3 columns):
id           213213 non-null object
tweet        213213 non-null object
date_time    213153 non-null object
dtypes: object(3)
memory usage: 4.9+ MB


In [4]:
# Convert date_time column into datetime stamp

tweets.date_time = pd.to_datetime(tweets.date_time)

In [5]:
# Check for duplicates

tweets[tweets.duplicated()]

Unnamed: 0,id,tweet,date_time


In [6]:
#tweets[tweets.tweet.duplicated()]

## Clean & Tokenize Tweets

In [7]:
def clean_tweets(dataframe):
    # Add whitespace to the end of every tweet
    dataframe['clean_tweet'] = dataframe.tweet.map(lambda x: x + " ") 
    # Remove http links
    dataframe.clean_tweet = dataframe.clean_tweet.map(lambda x: re.sub(r'http.*', '', x))
    # Remove special characters and numbers
    dataframe.clean_tweet = dataframe.clean_tweet.map(lambda x: re.sub(r"[^a-zA-Z#]", ' ', x))
    # Lowercase all tweets
    dataframe.clean_tweet = dataframe.clean_tweet.map(lambda x: x.lower())
    #Tokenize tweets and remove stop words
    stopword_list = stopwords.words('english')
    for i in range(len(dataframe.clean_tweet)):
        tokens = word_tokenize(dataframe.clean_tweet[i])
        clean_tokens = [w for w in tokens if w not in stopword_list]
        dataframe.clean_tweet[i] = clean_tokens


In [9]:
clean_tweets(tweets)
tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,id,tweet,date_time,clean_tweet
0,947702253313495045,Tesla $TSLA Rating Increased to Hold at ValuEn...,2018-01-01 00:33:19,"[tesla, tsla, rating, increased, hold, valueng..."
1,947702742264569857,Tesla $TSLA Rating Increased to Hold at ValuEn...,2018-01-01 00:35:15,"[tesla, tsla, rating, increased, hold, valueng..."
2,947709627394748416,Horseman Capital Management LTD Increases Posi...,2018-01-01 01:02:37,"[horseman, capital, management, ltd, increases..."
3,947712900377976832,Tesla INC (TSLA) Holding Lifted by Horseman Ca...,2018-01-01 01:15:37,"[tesla, inc, tsla, holding, lifted, horseman, ..."
4,947732179253170177,Insider Selling: Tesla Inc (NASDAQ:TSLA) VP Se...,2018-01-01 02:32:14,"[insider, selling, tesla, inc, nasdaq, tsla, v..."


## Lemmatize

In [None]:
#dataframe = test.copy()

In [10]:
tweets.clean_tweet[4]

['insider',
 'selling',
 'tesla',
 'inc',
 'nasdaq',
 'tsla',
 'vp',
 'sells',
 'shares',
 'stock']

In [30]:
def lemmatize_tweet(tweets):
    
    for i in range(len(tweets)):
        # Pos-tag each word in tweet
        for word in [tweets[i]]:
            pos_tag_list = nltk.pos_tag(word)
        # Convert pos-tag to be wordnet compliant
        wordnet_tags = []
        for j in pos_tag_list:
            # Adjective
            if j[1].startswith('J'):
                wordnet_tags.append(wordnet.ADJ)
            # Noun
            elif j[1].startswith('N'):
                wordnet_tags.append(wordnet.NOUN)
            # Adverb
            elif j[1].startswith('R'):
                wordnet_tags.append(wordnet.ADV)
            # Verb
            elif j[1].startswith('V'):
                wordnet_tags.append(wordnet.VERB)
            # Default to noun
            else:
                wordnet_tags.append(wordnet.NOUN)
        # Lemmatize each word in tweet
        lem_words = []
        for k in range(len(tweets[i])):
            lem_words.append(lemmatizer.lemmatize(tweets[i][k], pos=wordnet_tags[k]))
        lem_tweet = ' '.join(lem_words)
        tweets[i] = lem_tweet

In [None]:
lemmatize_tweet(tweets.clean_tweet)
tweets

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
tweets.clean_tweet[4]

## EDA

In [None]:
#example = dataframe.copy()

### What are the most common words?

In [None]:
# Join all the tweets into a single string
all_words_string = ' '.join([tweet for tweet in tweets.clean_tweet])

In [None]:
# Plot WorldCloud
wordcloud = WordCloud(width=800, height=500, random_state=20, max_font_size=200).generate(all_words_string)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Make a list of all the words in the dataframe

all_words_list = all_words_string.split()

In [None]:
# Find the frequency of each word in the dataframe

word_freq = nltk.FreqDist(all_words_list)
freq_df = pd.DataFrame({'Word': list(word_freq.keys()), 'Count': list(word_freq.values())}).sort_values(by=['Count'], ascending=False)
freq_df.head(10)

In [None]:
# Plot histogram of most frequent words

freq_df = freq_df.nlargest(columns="Count", n = 10) 
plt.figure(figsize=(16,5))
ax = sns.barplot(data=freq_df, x= "Word", y = "Count")
ax.set(ylabel = 'Count')
plt.show()

### What time of day are there the most Tesla tweets?

### What day of the week has the most Tesla tweets?

### What day of the year had the most tweets about Tesla?

## Save the Cleaned Tweets

In [None]:
filepath = r'/Users/erikadauria/Flatiron/Projects/Tesla_Twitter_Sentiment_Analysis/cleaned_tweets.csv'

daily_sentiment.to_csv(filepath, header=True)

test = pd.read_csv('cleaned_tweets.csv')
test.head()