# Import section

In [None]:
# python standard library
import json
import datetime as dt

# data science tools
import numpy as np
import pandas as pd
from pandas.tseries.offsets import MonthEnd
from pandas.tseries.offsets import DateOffset


# plotting tools
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import seaborn as sns

# scikit-learn
from sklearn.neighbors import KernelDensity
from scipy.stats import pearsonr, ttest_ind, mannwhitneyu

# natural language processing tools
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
nltk.download('vader_lexicon')

# other helper tools
from tqdm import tqdm
from PIL import Image

<br><br><br><br><br><br>
# SECTION 1: Initial analysis of Apple quotes
When searching for Apple quotes, we must discard food-related quotes while including quotes about Apple Inc. Therefore, we will use a rule-based approach, where we have a list of indicator words that decide whether to include a quotation. This way, we will be left with a filtered dataset containing mostly quotations related to the company Apple. This also enables us to utilize more extensive filtration algorithms, which would be too costly to run on the entire dataset. We have created a data preprocessing pipeline, which as a result has created a csv-file for each year, consisting only of Apple-related quotes.

Let's first load this data, as well as the relevant dates of the Apple product launches.


In [None]:
# Load the pre-filtered Apple quotes (one csv-file per year, newest first) and stack
# them into a single frame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so the
# yearly files are combined with one `pd.concat` call instead.
# `ignore_index=True` gives every quote a unique row label; without it each yearly file
# restarts its labels and label-based operations (e.g. `drop`) affect several rows.
quote_years = range(2020, 2014, -1)
df_initial_analysis = pd.concat(
    [pd.read_csv(f"data/quotes-{year}-apple-filter.csv", sep=";") for year in quote_years],
    ignore_index=True,
)

# Dates of the Apple events as ISO formatted strings
apple_event_dates_str = ["2015-03-09", "2015-06-10", "2015-09-09",
                         "2016-03-21", "2016-06-15", "2016-09-07", "2016-10-27",
                         "2017-06-07", "2017-09-12",
                         "2018-03-27", "2018-06-06", "2018-09-12", "2018-10-30",
                         "2019-03-25", "2019-06-05", "2019-09-10", "2019-12-02",
                         "2020-06-24", "2020-09-15", "2020-10-13", "2020-11-10"]

# The same event dates as `datetime.date` objects
apple_event_dates = [dt.datetime.strptime(date, "%Y-%m-%d").date() for date in apple_event_dates_str]

In [None]:
# Display the combined frame holding the quotes of all six yearly files
df_initial_analysis

As seen above, we have gathered approximately 90,000 quotes from the Quotebank data from 2015 through 2020. Let's start by investigating how the number of quote occurrences varies throughout the year.

## How does the daily number of quotes vary throughout the year?

In [None]:
# Clean the date column, such that it only contains date information and not timestamp.
# The first 10 characters of the timestamp string are the "YYYY-MM-DD" part. The
# vectorised `.str` accessor replaces the row-wise `apply`, which invoked a Python
# lambda once per quote.
df_initial_analysis['date_clean'] = df_initial_analysis['date'].str[:10]

In [None]:
# Create plot to see the Apple-related quotes through the years.
# Only `numOccurrences` is summed per day, and only once; summing the whole frame
# (as before) also tried to aggregate the text columns and ran the groupby twice.
quote_num = df_initial_analysis.groupby('date_clean')['numOccurrences'].sum()
time = quote_num.index

fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(time, quote_num, width=3, align='center')
ax.set_xlabel("dates", size=15)
# Label only every 30th day to keep the axis readable
plt.xticks(time[::30], rotation=90)
ax.set_ylabel("Quotation occurrences", size=15)
ax.set_title("Number of daily Apple-related Citations 2015 through 2020", size=20)
plt.show()

We see a massive spike close to the date 2018-04-23, which seems rather suspicious. Let's investigate this further by finding the particular quote and printing its information.

In [None]:
# Show the occurrence count and the text of the quote with the highest occurrence count.
# `.item()` requires that exactly one quote attains the maximum.
max_occurrences = df_initial_analysis.numOccurrences.max()
outlier_rows = df_initial_analysis.loc[df_initial_analysis.numOccurrences == max_occurrences]
print(f"Quote occurrences: {outlier_rows.numOccurrences.item()} ")
print(f"Quote: {outlier_rows.quotation.item()}")

It seems unlikely that such a generic quote has such a high occurrence number. There is the possibility that the quote originates from a viral tweet or advertisement, however, googling the quote yields no results. We will thus regard this as an extreme outlier for now, and investigate it further by comparing to the original data set in Milestone 3. 

In [None]:
# Drop outlier.
# The row is filtered out with a boolean mask rather than `drop(<index labels>)`:
# the yearly files were stacked without resetting the index, so row labels repeat and
# a label-based drop would also remove unrelated quotes sharing the outlier's label.
OUTLIER_OCCURRENCES = 39978  # occurrence count of the suspicious quote printed above
df_initial_analysis = df_initial_analysis[df_initial_analysis.numOccurrences != OUTLIER_OCCURRENCES]

Now that the outlier has been dropped, let's take a look at the plot again. 

In [None]:
# Create plot to see the Apple-related quotes through the years (without outlier).
# Only `numOccurrences` is summed per day, and only once; summing the whole frame
# (as before) also tried to aggregate the text columns and ran the groupby twice.
quote_num = df_initial_analysis.groupby('date_clean')['numOccurrences'].sum()
time = quote_num.index

fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(time, quote_num, width=3, align='center')
ax.set_xlabel("dates", size=15)
# Label only every 30th day to keep the axis readable
plt.xticks(time[::30], rotation=90)
ax.set_ylabel("Quotation occurrences", size=15)
ax.set_title("Number of daily Apple-related Citations 2015 through 2020", size=20)
plt.show()

This looks more reasonable. Let's check whether any of the peaks coincide with Apple events.

In [None]:
# Line plot of the daily occurrences with the Apple event days highlighted in red.
# `time` and `quote_num` come from the previous plotting cell.
event_day_mask = time.isin(apple_event_dates_str)

fig, ax = plt.subplots(figsize=(18, 8))
ax.plot(time, quote_num, label="Quote occurrences")
ax.scatter(time[event_day_mask], quote_num[event_day_mask],
           c="r", s=100, zorder=3, label='Apple events')

ax.set_title("Number of daily Apple-related Citations 2015 through 2020", size=20)
ax.set_xlabel("dates", size=15)
ax.set_ylabel("Quotation occurrences", size=15)
plt.xticks(time[::30], rotation=90)
ax.legend(prop={'size': 20})
plt.show()


Many of the peaks are very close to Apple product launches. In Milestone 3 we will investigate this further.

## 3. How does the distribution of quote occurrences look?

We will now take a quick glance at how the distribution of Apple quotes look.

In [None]:
# Histogram of the occurrence counts, restricted to quotes with fewer than 60 occurrences
occurrences_below_60 = df_initial_analysis.numOccurrences[df_initial_analysis.numOccurrences < 60]

fig, ax = plt.subplots(figsize=(15, 8))
ax.hist(occurrences_below_60, bins=40)
ax.set_xlabel("Occurrence number", size=15)
ax.set_ylabel("Frequency", size=15)
ax.set_title("Distribution of quote occurrences (Occurrences capped at 60)", size=20)


As seen above, most of the quotes have been cited less than 10 times.

## 4. How do the most frequent speakers vary throughout the year?

### Investigation of frequent speakers

We will now investigate which speakers are the most cited when it comes to Apple. 


In [None]:
# Total occurrences per attributed speaker, most quoted first. Quotes whose speaker is
# the literal string 'None' (no attribution) are left out.
# `numeric_only=True` makes explicit that only numeric columns are summed, so the text
# columns (quotation, date, ...) are not concatenated per speaker.
top_speakers = (
    df_initial_analysis.loc[df_initial_analysis.speaker != 'None']
    .groupby(by=['speaker'])
    .sum(numeric_only=True)
    .sort_values(by='numOccurrences', ascending=False)
    .reset_index()
)

# Bar chart of the ten most quoted speakers
fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(top_speakers.speaker[:10], top_speakers.numOccurrences[:10])
ax.set_xlabel("Author of quote", size=15)
ax.set_ylabel("Quotation occurrences", size=15)
ax.set_title("Authors with most qoute occurrences (2015-2020)", size=20)
plt.show()

We see that Apple-related staff have the most quotes related to Apple, e.g. Tim Cook, Steve Jobs, Eddy Cue, Jony Ive and Steve Wozniak. We now want to portray how this has evolved throughout the years. We will do this by making a column for each speaker. This enables us to plot the most active speakers over the period. Note that Donald Trump has several entries. The code below will fix this as well.

In [None]:
# Alternative spellings under which Donald Trump appears in the speaker column
TRUMP_ALIASES = ['President Donald Trump', 'President Trump']


def _occurrences_per_speaker(quotes, date_length, date_format):
    """Pivot quotes into a (period x speaker) table of summed occurrences.

    Parameters
    ----------
    quotes : pandas.DataFrame
        Frame with the columns 'date' (timestamp string), 'speaker' and 'numOccurrences'.
    date_length : int
        Number of leading characters of 'date' that identify the period
        (10 -> day, 7 -> month).
    date_format : str
        `strptime` format matching those leading characters.

    Returns
    -------
    pandas.DataFrame
        Index: period start as datetime; one column per speaker; values: summed
        occurrences (0 where a speaker has no quote in the period). The Trump aliases
        are merged into 'Donald Trump' and the unattributed 'None' column is removed.
    """
    periods = pd.to_datetime(quotes['date'].str[:date_length], format=date_format)
    table = quotes.assign(date=periods).pivot_table(
        values='numOccurrences', index=['date'], columns='speaker',
        fill_value=0, aggfunc='sum')

    # Merge only the alias columns that actually occur, so a data subset without one
    # of the spellings does not raise a KeyError.
    aliases = [alias for alias in TRUMP_ALIASES if alias in table.columns]
    if aliases:
        merged = table[aliases].sum(axis=1)
        if 'Donald Trump' in table.columns:
            merged = merged + table['Donald Trump']
        table['Donald Trump'] = merged

    return table.drop(columns=['None'] + aliases, errors='ignore')


#Occurrences per speaker aggregated per day and per month
quotes_per_date = _occurrences_per_speaker(df_initial_analysis, 10, '%Y-%m-%d')
quotes_per_month = _occurrences_per_speaker(df_initial_analysis, 7, '%Y-%m')

#Cummulative representation of quotes
cummulative_quotes_per_date = quotes_per_date.cumsum(axis=0)
cummulative_quotes_per_month = quotes_per_month.cumsum(axis=0)

Due to visibility reasons we have chosen to only display the three most active speakers. 

In [None]:
#Pick the 3 speakers with the highest total number of occurrences
speaker_totals = quotes_per_month.sum(axis=0)
top_quoters = speaker_totals.nlargest(3).index

#Restrict the monthly and the cumulative monthly tables to these speakers
top_quotes_per_month = quotes_per_month[top_quoters]
cummulative_top_quotes_per_month = cummulative_quotes_per_month[top_quoters]

#One line per speaker showing the monthly occurrences
fig = plt.figure(figsize=(20, 10))
labels = list(top_quoters)
for quoter in labels:
    plt.plot(top_quotes_per_month.index, top_quotes_per_month[quoter])

plt.legend(labels, ncol=2, loc='upper left', fancybox=True, shadow=True, fontsize=12)
fig.suptitle('Quotation occurences per month (3 most quoted speakers)', fontsize=30)
plt.xlabel('Year', fontsize=15)
plt.ylabel('Number of quotations', fontsize=15)


In [None]:
# Show the names of the three most quoted speakers selected in the previous cell
top_quoters

As seen above, each speaker has certain periods where they are quoted a lot, causing several spikes in the graph. To smooth this out we will plot the cumulative distribution over the same period.

In [None]:
#Extract the 6 most frequent speakers
top_quoters = quotes_per_month.sum(axis=0).nlargest(6).index

#Restrict the monthly and the cumulative monthly tables to these speakers
top_quotes_per_month = quotes_per_month[top_quoters]
cummulative_top_quotes_per_month = cummulative_quotes_per_month[top_quoters]

#One line per speaker showing the running total of occurrences
fig = plt.figure(figsize=(20, 10))
labels = []
for quoter in top_quoters:
    plt.plot(cummulative_top_quotes_per_month.index, cummulative_top_quotes_per_month[quoter])
    labels.append(quoter)

plt.legend(labels, ncol=2, loc='upper left', fancybox=True, shadow=True,
           fontsize=12)
# The plotted table is aggregated per month and indexed by date, so the title and the
# x-axis label say so (they previously claimed weekly data / week numbers).
fig.suptitle('Cumulative quotation occurrences per month', fontsize=30)
plt.xlabel('Year', fontsize=15)
plt.ylabel('Number of quotations', fontsize=15)



We immediately recognize the big names *Donald Trump*, *Tim Cook* and *Steve Jobs*. Although Steve Jobs has passed away, he still gets quoted frequently in the media. In addition, *Jony Ive* is the former chief designer of Apple, who left the company in 2019. *Eddy Cue* is a senior vice president of Apple, reporting to Tim Cook. So it is natural that these names appear. *Phil Schiller* is also an Apple executive. We initially observe that the most frequent speakers about Apple are indeed part of the company. Not very shocking.

The problem with the plot above is that it only shows the speakers with the most attributed quotes throughout the whole period. To get a more dynamic view over the whole period, we've made an animation which shows the most frequent speakers up to each point in time. This can be viewed in the README.

### Code for making a .mp4 file with the animation

In [None]:
def summary(values, ranks):
    """Build the text-annotation keywords for the bar chart race period summary.

    Parameters
    ----------
    values : object exposing ``.sum()``
        Quote counts of the current period.
    ranks : any
        Unused; part of the callback signature.

    Returns
    -------
    dict
        Keyword arguments (position, text, alignment, size) for the annotation. The
        text shows the total rounded to the nearest hundred with thousands separators.
    """
    rounded_total = int(round(values.sum(), -2))
    text = 'Total Quotes - ' + format(rounded_total, ',.0f')
    return dict(x=.99, y=.05, s=text, ha='right', size=8)

In [None]:
# Determine whether to make mp4 visualization of most quoted speakers. NOTE: To run this code 'bar_chart_race' and 'ffmpeg' needs to be installed in the environment.
# NOTE(review): `bcr` is not bound by the import cell at the top of this notebook, so
# enabling the flag raises a NameError unless it is imported first - presumably via
# `import bar_chart_race as bcr`; confirm.
# The animation is built from the cumulative *monthly* table and written to
# 'quotes_per_date_plot.mp4'; `summary` supplies the per-period total annotation.
create_mp4 = False
if create_mp4:
    quotes_per_date_plot = bcr.bar_chart_race(cummulative_quotes_per_month, filename='quotes_per_date_plot.mp4',
                                          n_bars=10,
                                          filter_column_colors=True,
                                          period_length=1000,
                                          steps_per_period=20,
                                          bar_label_size=7,
                                          tick_label_size=7,
                                          title='Most frequently quoted speakers about Apple',
                                          period_fmt='%B %Y',  
                                          dpi=500,
                                          shared_fontdict={'family' : 'serif', 'color' : 'black', 'size':'8', 'weight': 'normal'},
                                          cmap='dark12',
                                          period_summary_func=summary)

    

---

## Initial analysis of Apple stock and events

Since a huge part of this project will be to compare the quote data with stock data, we will now look at the Apple stock data, events and quarterly earnings for Apple Inc. 
We start by loading in the data sets.

In [None]:
#Load stock data
# The file is read without date parsing, so the `Date` column stays as read from the csv.
stock_data_initial=pd.read_csv('./data/AAPL_2015_to_2020_yahoo_finance.csv')
# Preview the first ten rows
stock_data_initial.head(10)

In [None]:
#Load quarterly earnings data
quarterly_earnings=pd.read_excel('./data/Earnings_Apple.xlsx')
# Preview the first ten rows
quarterly_earnings.head(10)

We see that the first 8 rows of the `quarterly_earnings` data frame are irrelevant for this study. We, therefore, remove those.
We also observe that the `quarterly_earnings` and `stock_data_initial` have a different format of their Date column. We need to change one of them to be able to compare the two columns.


In [None]:
def _normalize_earnings_date(raw):
    """Turn an earnings date string like 'Jan 27, 2021, 12 AMEST' into 'YYYY-MM-DD'."""
    without_suffix = raw.replace(", 12 AMEST", "").replace(",", "")
    return dt.datetime.strptime(without_suffix, "%b %d %Y").strftime("%Y-%m-%d")


#Discard the rows labelled 0-7, which are not relevant for the study
quarterly_earnings = quarterly_earnings.drop(index=list(range(8)))

#Bring the Earnings Date column to the same text format as the stock data's Date column
quarterly_earnings["Earnings Date"] = quarterly_earnings["Earnings Date"].map(_normalize_earnings_date)

We now create a new dataframe containing only stock data from earning dates.

In [None]:
#Keep only the stock rows whose Date matches one of the quarterly earnings dates
apple_earnings_and_stock_data=stock_data_initial[stock_data_initial.Date.isin(quarterly_earnings["Earnings Date"])]
# Fixed seed so that the displayed sample is the same on every run of the notebook
apple_earnings_and_stock_data.sample(5, random_state=0)

In [None]:
#Create a new dataframe only containing stock data from Apple event-days
# Rows are selected by matching the stock `Date` column against the event date strings.
apple_event_and_stock_data=stock_data_initial[stock_data_initial.Date.isin(apple_event_dates_str)]
apple_event_and_stock_data.head()

In [None]:
#Create new dataframes only containing stock data from Apple event-days and from earnings dates
# NOTE(review): both assignments repeat, with identical expressions, what the two
# preceding cells already compute - this cell looks redundant; confirm before removing.
apple_event_and_stock_data=stock_data_initial[stock_data_initial.Date.isin(apple_event_dates_str)]
apple_earnings_and_stock_data=stock_data_initial[stock_data_initial.Date.isin(quarterly_earnings["Earnings Date"])]

In [None]:
#apple_event_and_stock_data.to_csv('./data/events_dates_with_stock_data.csv')
#apple_earnings_and_stock_data.to_csv('./data/earnings_dates_with_stock_data.csv')

## Add data to dates where the stock market is closed

In [None]:
# Preview the first three rows of the stock data
stock_data_initial.head(3) 

We see that `stock_data_initial` doesn't contain all dates, due to bank holidays and weekends. This may create conflicts when we want to merge the quotes dataset with `stock_data_initial`. We will therefore look into different ways of solving this problem.
First we try to fill the left out dates with the last valid observation.

In [None]:
# Build a copy of the stock data with one row per calendar day. Days without trading
# data receive the values of the last preceding trading day (forward fill).
idx = pd.date_range('2015-01-02', '2020-12-31')
stock_data_with_closed_days = (
    stock_data_initial
    .copy()
    .set_index(pd.DatetimeIndex(stock_data_initial.Date))
    .reindex(idx, method='ffill')
    .reset_index()                      # the daily index becomes the column 'index'
    .drop(columns=['Date'])             # forward-filled (stale) date column
    .rename(columns={"index": "Date"})
)

In [None]:
# Preview the first rows of the daily, forward-filled stock data
stock_data_with_closed_days.head()

We now see that all dates have observations.

We can then plot the event dates, quarterly earning dates and stock price to see if there is any correlation between the dates.

In [None]:
#Plots the closing price after adding the days where the stock market is closed
# The event/earnings frames are slices of the csv data, whose `Date` column holds
# unparsed strings, while the line is drawn on a datetime axis. The marker dates are
# converted explicitly so that they are placed on the same time axis.
event_dates = pd.to_datetime(apple_event_and_stock_data.Date)
earnings_dates = pd.to_datetime(apple_earnings_and_stock_data.Date)

fig,ax = plt.subplots(figsize=(18,8))
ax.plot(stock_data_with_closed_days.Date,stock_data_with_closed_days.Close, label='Apple stock')
ax.set_xlabel("Date (daily)", size=18)
ax.set_ylabel('Stock closing price ($)', size=18)
# One tick roughly every year (every 366th daily row)
plt.xticks(stock_data_with_closed_days.Date[::366])
ax.scatter(event_dates, apple_event_and_stock_data.Close, c="r", s=100, zorder=3, label='Apple events')
ax.scatter(earnings_dates, apple_earnings_and_stock_data.Close, c="g", s=100, zorder=3, label='Apple earning dates')
ax.legend(prop={'size': 25})
plt.show()

In addition we plot a graph with the trading volume for each day, to see if there are higher trading volumes around the event and quarterly earning dates.

In [None]:
#Plot each day and its trading volume
# The event/earnings frames are slices of the csv data, whose `Date` column holds
# unparsed strings, while the bars are drawn on a datetime axis. The marker dates are
# converted explicitly so that they are placed on the same time axis.
event_dates = pd.to_datetime(apple_event_and_stock_data.Date)
earnings_dates = pd.to_datetime(apple_earnings_and_stock_data.Date)

fig,ax = plt.subplots(figsize=(18,8))
ax.bar(stock_data_with_closed_days.Date,stock_data_with_closed_days.Volume, label='Apple stock', width=3, align='center')
ax.set_xlabel("Date (daily)",size=15)
ax.set_ylabel('Daily trading volume', size=18)
# One tick roughly every year (every 366th daily row)
plt.xticks(stock_data_with_closed_days.Date[::366])
ax.scatter(event_dates, apple_event_and_stock_data.Volume, c="r", s=100, zorder=3, label='Apple events')
ax.scatter(earnings_dates, apple_earnings_and_stock_data.Volume, c="g", s=100, zorder=3, label='Apple earning dates')
ax.legend(prop={'size': 20})
plt.show()

Interestingly, it seems that some events occur in periods with high trading volume. In milestone 3 we want to investigate how this correlates to the media attention. 

## Group by week

Quote values may fluctuate heavily from day to day, and it is thus reasonable to aggregate in weeks. When comparing quote values to stocks, we thus need to be able to aggregate stock data on a weekly basis. Instead of imputing data into the `stock_data_initial`, we thus try to average the values per week. 

In [None]:
# Weekly averages of the stock data: every date is first moved back by seven days and
# the rows are then averaged within weekly bins of frequency 'W-MON'.
# NOTE(review): the original comment pointing to the source of this recipe was cut off.
shifted_dates = pd.to_datetime(stock_data_initial.Date) - pd.to_timedelta(7, unit='d')
stock_data_weekly_average = (
    stock_data_initial
    .assign(Date=shifted_dates)
    .groupby(pd.Grouper(key='Date', freq='W-MON'))
    .mean()
    .reset_index()
    .sort_values('Date')
)
stock_data_weekly_average.head()

We then plot the new graph.

In [None]:
#Plots the weekly average
# The event/earnings frames are slices of the csv data, whose `Date` column holds
# unparsed strings, while the weekly line is drawn on a datetime axis. The marker
# dates are converted explicitly so that they are placed on the same time axis.
event_dates = pd.to_datetime(apple_event_and_stock_data.Date)
earnings_dates = pd.to_datetime(apple_earnings_and_stock_data.Date)

fig,ax = plt.subplots(figsize=(18,8))
ax.plot(stock_data_weekly_average.Date,stock_data_weekly_average.Close, label='Apple stock')
# The axis label object was previously bound to a stray `apple_events` name; the call
# is kept, the meaningless assignment is dropped.
ax.set_xlabel("Date (weekly)",size=18)
ax.set_ylabel('Stock closing price ($)', size=18)
ax.scatter(event_dates, apple_event_and_stock_data.Close, c="r", s=100, zorder=3, label='Apple events')
ax.scatter(earnings_dates, apple_earnings_and_stock_data.Close, c="g", s=100, zorder=3, label='Apple earnings date')
ax.legend(prop={'size': 25})
plt.show();

<br><br><br><br><br><br><br><br>
# SECTION 2: Sentiment analysis of Apple quotes


## Part 1: Creating the sentiment dataframe
First we are extracting the apple data from the already filtered csv-files. These were created using the Extraction_of_relevant_apple_data.ipynb jupyter notebook.

In [None]:
# Load the pre-filtered Apple quotes (one csv-file per year, newest first) and stack
# them into a single frame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so the
# yearly files are combined with one `pd.concat` call; `ignore_index=True` gives every
# quote a unique row label.
df_sentiment_analysis = pd.concat(
    [pd.read_csv(f"data/quotes-{year}-apple-filter.csv", sep=";") for year in range(2020, 2014, -1)],
    ignore_index=True,
)

In [None]:
# Preview the first rows of the combined quotes frame
df_sentiment_analysis.head()

Changing the dataframe columns to their appropriate data types.

In [None]:
# Cast the identifier and text columns to pandas' string dtype and the occurrence
# counts to 32-bit integers.
df_sentiment_analysis = df_sentiment_analysis.astype({
    'quoteID': 'string',
    'quotation': 'string',
    'speaker': 'string',
    'numOccurrences': 'int32',
})

In [None]:
# Check the resulting column dtypes
df_sentiment_analysis.dtypes

Sorting the dataframe by date so we get the correct order of the dataframes for the sentiment graph that will be created later

In [None]:
# Order the quotes by their `date` column (ascending), then preview the first rows
df_sentiment_analysis = df_sentiment_analysis.sort_values('date')
df_sentiment_analysis.head()

For each row in the dataframe we will attach 4 new columns: neg, neu, pos and compound. These are the negative, neutral, positive and compound values we get from the SentimentIntensityAnalyzer polarity scores.

In [None]:
# Score every quote with VADER and collect the scores next to the quote's metadata.
# Run it once and then read the result from csv.
#
# The original version of this cell took about 20 minutes because it created a new
# SentimentIntensityAnalyzer for every quote and grew the result frame one row at a
# time with `DataFrame.append` (quadratic copying; `append` no longer exists in
# pandas 2.0). One analyzer is now reused and the frame is built once from a list
# of records.

column_names = ['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences','neg', 'neu', 'pos', 'compound']

print("Total number of rows: ", df_sentiment_analysis.shape[0])

analyzer = SentimentIntensityAnalyzer()
sentiment_records = []
for idx, row in tqdm(df_sentiment_analysis.iterrows(), total=df_sentiment_analysis.shape[0]):

    # Score dict returned by the analyzer (provides the neg, neu, pos and compound columns)
    record = analyzer.polarity_scores(row['quotation'])

    # Carry the quote's metadata over into the record
    record['quoteID'] = row['quoteID']
    record['quotation'] = row['quotation']
    record['speaker'] = row['speaker']
    record['date'] = row['date']
    record['numOccurrences'] = row['numOccurrences']
    sentiment_records.append(record)

# `columns=` fixes the column order of the resulting frame
sentiment_info_df = pd.DataFrame(sentiment_records, columns=column_names)


We will also attach a date_clean column to the dataframe which only contains the date and not the timestamp.

In [None]:
# Index the scored quotes by quoteID and order them by their timestamp
sentiment_info_df_sorted = sentiment_info_df.set_index('quoteID').sort_values('date')

# Clean the date column, such that it only contains date information and not timestamp.
# The vectorised `.str` slice replaces the row-wise `apply`, which invoked a Python
# lambda once per quote.
sentiment_info_df_sorted['date_clean'] = sentiment_info_df_sorted['date'].str[:10]
sentiment_info_df_sorted.head()

We will save the current dataframe to csv since the analysis part takes around 20 minutes, so we don't have to redo that part each time we are going to change something in the notebook.

In [None]:
#sentiment_info_df_sorted.to_csv('./data/sentiment_df.csv')

<br><br>
## Part 2: Sentiment analysis
Here we first read the dataframe from the csv and remove the Fortnite outlier quote, which has 39,978 occurrences. The decision to remove this quote was made in the exploratory part of the project.

In [None]:
# Read from csv to recreate a fresh dataframe
sentiment_df = pd.read_csv('./data/sentiment_df.csv')

# Locate the outlier by its occurrence count and show it
outlier_mask = sentiment_df.numOccurrences == 39978
print(sentiment_df[outlier_mask])

# Remove the outlier. Filtering with the mask replaces the hard-coded row label
# (58674), which silently pointed at a different quote whenever the csv was regenerated
# with a different row order.
sentiment_df = sentiment_df[~outlier_mask]

**Raw datapoints visualized:**  
First we will visualize the datapoints for positive and negative quotes.
- Green points: positive quotes
- Red points: negative quotes

This visualization is mainly meant to give a feeling of how the points are distributed, but it won't give us a lot of information straight away. It is simply too much information for a human being to interpret, so we will make an aggregated graph based on the positive and negative points.


In [None]:
# Quotes whose compound score lies within +/- this value around zero are not plotted
cutoff_decimal = 0.0

# Split the quotes into a positive and a negative group by compound score
pos_scores_df = sentiment_df.loc[sentiment_df['compound'] > cutoff_decimal]
neg_scores_df = sentiment_df.loc[sentiment_df['compound'] < -cutoff_decimal]
pos_dates, pos_scores = pos_scores_df['date_clean'], pos_scores_df['compound']
# Negative scores are mirrored so that both groups share the 0..1 range
neg_dates, neg_scores = neg_scores_df['date_clean'], -neg_scores_df['compound']

fig_sentiment_analysis, ax_sentiment_analysis = plt.subplots(figsize=(25, 10))
ax_sentiment_analysis.set_ylim([0.0, 1])
ax_sentiment_analysis.scatter(pos_dates, pos_scores, s=5, color='green')
ax_sentiment_analysis.scatter(neg_dates, neg_scores, s=5, color='red')

# Place a major tick at every 50th position of the x axis
loc = plticker.MultipleLocator(base=50)
ax_sentiment_analysis.xaxis.set_major_locator(loc)
ax_sentiment_analysis.tick_params(axis='both', which='major', labelsize=18)

plt.title('Visualizing all sentiment intensity scores', fontsize=18)
plt.xlabel('Date', fontsize=18)
plt.ylabel('Sentiment intensity scores for each tweet in dataset', fontsize=17)
plt.xticks(rotation=90)
plt.show()

## 1 month aggregated sentiment intensity scores

Positive quotes contribute their compound score (given by the SentimentIntensityAnalyzer, a positive number) multiplied by numOccurrences.  
Negative quotes contribute their compound score (given by the SentimentIntensityAnalyzer, a negative number) multiplied by numOccurrences. 

In [None]:
# Occurrence-weighted compound score of every quote, summed per calendar month.
#
# The previous hand-written loop emitted a month's total only when the *next* month
# started and stored it under that next month's label, so every value was shifted by
# one month and the total of the final month was never emitted. Grouping by the month
# prefix of `date_clean` ("YYYY-MM") pairs every total with its own month, includes the
# last month, and does not depend on the row order of `sentiment_df`.
weighted_scores = sentiment_df['compound'] * sentiment_df['numOccurrences']
monthly_scores = weighted_scores.groupby(sentiment_df['date_clean'].str[:7]).sum()

# Parallel lists; groupby returns the months sorted, i.e. in chronological order
agg_month_dates = monthly_scores.index.tolist()
agg_month_sentiment = monthly_scores.tolist()

    
    

**For the datastory:**
For our sentiment analysis, we have labeled every quote in our Apple dataset with a sentiment compound score between -1.0 and 1.0. Since there are too many quotes to represent visually, we have aggregated the quote scores for each month and visualized the aggregated score for that month. These calculations depend heavily on the sentiment tool, so to ensure that we don't include neutral quotes or quotes that the tool is hesitant about, we have chosen an arbitrary cutoff of 0.6. Therefore, the graph will not include sentiment intensity scores between -0.6 and 0.6 in the calculations.

Comments on the graph: As we can see by the chart there seems to be a disproportionate balance between positive and negative quotes since most months, the aggregated score is far above 0. We can also see a reduction in sentiment intensity scores from late 2019 until early 2020 - which may be caused by covid-19.

In [None]:
fig_1_sentiment, ax_1_sentiment = plt.subplots(figsize=(18,8))

# Copy the month labels and their aggregated scores into the lists that get plotted
month_score_pairs = list(zip(agg_month_dates, agg_month_sentiment))
result_mon = [month_label for month_label, _ in month_score_pairs]
result_values = [month_score for _, month_score in month_score_pairs]

sns.set_theme(
    style="ticks",
    rc={"axes.spines.right": False, "axes.spines.top": False},
    font_scale=1.8,
    font="PT Sans",
)

ax_1_sentiment.plot(result_mon, result_values, '-o', color='#800020')
plt.xlabel('Date', fontsize=24)
plt.ylabel('Aggregated sentiment intensity score', fontsize=24)
plt.xticks(rotation=45);

# Show only every 12th tick label (one per year) and hide the others
tick_labels = ax_1_sentiment.xaxis.get_ticklabels()
for position, tick_label in enumerate(tick_labels):
    if position % 12:
        tick_label.set_visible(False)

## 1 month aggregated and weighted sentiment intensity scores (based on speaker)
**Name list - multiplicative weight:**
1. Tim Cook  - 2.0
2. Steve Jobs - 1.9
3. Eddy Cue - 1.8
4. Jony Ive - 1.7
5. Donald Trump - 1.6
6. Phill Schiller - 1.5
7. Jeff Williams - 1.4
8. Steve Wozniak - 1.3
9. Ben Wood  -  1.2
10. Brian White  - 1.1

Based on our "Most frequently quoted speakers about Apple" bar chart race, we give additional weight to quotes whose speaker is one of the people in the top 10 list above.

In [None]:
# Multiplicative weight applied to a quote's sentiment contribution when its
# speaker is one of the ten most frequently quoted people; any speaker that is
# not listed here is treated with a weight of 1 by the aggregation loop.
speaker_weights = {
    "Tim Cook": 2,
    "Steve Jobs": 1.9,
    "Eddy Cue": 1.8,
    "Jony Ive": 1.7,
    "Donald Trump": 1.6,
    # The Apple executive's name is spelled "Phil Schiller"; the misspelled key
    # is kept as well so that any data using that spelling is still weighted.
    "Phil Schiller": 1.5,
    "Phill Schiller": 1.5,
    "Jeff Williams": 1.4,
    "Steve Wozniak": 1.3,
    "Ben Wood": 1.2,
    "Brian White": 1.1,
}

In this code block we do the same as we did in the one above, the only difference is that we scale the datapoint if it is included in the speaker weights dictionary.

In [None]:
# Monthly totals of compound * numOccurrences * speaker weight, built by walking
# the rows of `sentiment_df` in frame order and closing a bucket whenever the
# first seven characters of `date_clean` (the 'YYYY-MM' part) change.
agg_month_speaker_weight_sentiment = []
agg_month_speaker_weight_dates = []

current_speaker_weight_month = None
current_speaker_weight_size = 0
for _, quote in sentiment_df.iterrows():
    quote_month = quote['date_clean'][:7]

    if current_speaker_weight_month is None:
        # very first row: open the first bucket
        current_speaker_weight_month = quote_month
    elif quote_month != current_speaker_weight_month:
        # NOTE(review): the finished total is stored under the label of the month
        # that is just starting, and the total of the last month in the frame is
        # never appended. Both match the other aggregation cells; confirm intended.
        agg_month_speaker_weight_sentiment.append(current_speaker_weight_size)
        agg_month_speaker_weight_dates.append(quote_month)
        current_speaker_weight_size = 0
        current_speaker_weight_month = quote_month

    # Speakers without an entry in `speaker_weights` count with weight 1.
    speaker_mult = speaker_weights.get(quote['speaker'], 1)

    # Only clearly polarised quotes (|compound| > 0.6) contribute to the total.
    if abs(quote['compound']) > 0.6:
        current_speaker_weight_size += quote['compound'] * quote['numOccurrences'] * speaker_mult

    
    

Here we have the 1 month aggregations scores which are multiplied by the speaker weights, which gives extra weight to the important speakers.

In [None]:
# Line chart of the speaker-weighted monthly aggregates.
fig_2_sentiment, ax_2_sentiment = plt.subplots(figsize=(18, 8))

# Pair every month label with its aggregated score (zip stops at the shorter list).
month_score_pairs = list(zip(agg_month_speaker_weight_dates, agg_month_speaker_weight_sentiment))
result_mon = [month_label for month_label, _ in month_score_pairs]
result_values = [month_score for _, month_score in month_score_pairs]

# '-o' already draws a marker on every point, so no separate scatter call is needed.
ax_2_sentiment.plot(result_mon, result_values, '-o', color='green')
# The data are quotes (not tweets), so the title says so.
ax_2_sentiment.set_title('1 month aggregation scores for quotes with additional speaker weights', fontsize=18)
ax_2_sentiment.set_xlabel('Date', fontsize=18)
ax_2_sentiment.set_ylabel('Aggregated sentiment intensity score', fontsize=18)

plt.xticks(rotation=90);

<br><br>
## 1 week aggregated sentiment intensity scores
In this graph we aggregate per week, i.e. every seven days we create one data point on the graph. A counter is increased by one each time the day changes; when the counter reaches seven, we "submit" the aggregated score to the `agg_week_sentiment_weekly` list, which we use for plotting, and reset the counter. This way we get the aggregated scores for a 7-day period, i.e. a week.

In [None]:
# Weekly totals of compound * numOccurrences: a bucket is closed after seven
# day changes have been seen while walking `sentiment_df` in frame order.
agg_week_sentiment_weekly = []
agg_week_dates_weekly = []
day_counter_weekly = 0

current_day_weekly = None
current_size_weekly = 0
for _, quote in sentiment_df.iterrows():
    quote_day = quote['date_clean'][:10]  # the first ten characters of the date ('YYYY-MM-DD')

    if current_day_weekly is None:
        current_day_weekly = quote_day
    if quote_day != current_day_weekly:
        day_counter_weekly += 1
        current_day_weekly = quote_day

    if day_counter_weekly == 7:
        # The total is labelled with the day on which the seventh change occurred,
        # i.e. the first day after the seven distinct days that were summed.
        agg_week_sentiment_weekly.append(current_size_weekly)
        agg_week_dates_weekly.append(current_day_weekly)
        current_size_weekly = 0
        day_counter_weekly = 0

    # occurrence-weighted contribution of this quote
    current_size_weekly += quote['compound'] * quote['numOccurrences']

    

In [None]:
# Line chart of the weekly aggregates (occurrence-weighted only).
fig_3_sentiment, ax_3_sentiment = plt.subplots(figsize=(18, 8))

# Pair every week label with its aggregated score (zip stops at the shorter list).
week_score_pairs = list(zip(agg_week_dates_weekly, agg_week_sentiment_weekly))
result_week = [week_label for week_label, _ in week_score_pairs]
result_week_values = [week_score for _, week_score in week_score_pairs]

ax_3_sentiment.plot(result_week, result_week_values, color='green')
loc_weekly = plticker.MultipleLocator(base=5)  # one tick for every 5th plotted week
ax_3_sentiment.xaxis.set_major_locator(loc_weekly)
# The plotted weekly totals are built from compound * numOccurrences only, and
# the data are quotes, so the title no longer mentions tweets or speaker weights.
ax_3_sentiment.set_title('1 week aggregation scores for quotes', fontsize=18)
ax_3_sentiment.set_xlabel('Date', fontsize=18)
ax_3_sentiment.set_ylabel('Aggregated sentiment intensity score', fontsize=18)

plt.xticks(rotation=90);

<br><br>
## 1 day aggregated sentiment intensity scores
In this graph we aggregate the quotes from each day, i.e. every unique day becomes one data point on the graph. It is calculated by "submitting" the scores every time the day changes in the dataframe. This way we get the aggregated scores for each unique day.

In [None]:
# Daily totals of compound * numOccurrences: a bucket is closed every time the
# first ten characters of `date_clean` change while walking `sentiment_df`.
agg_sentiment_daily = []
agg_dates_daily = []

current_day_daily = None
current_size_daily = 0
for _, quote in sentiment_df.iterrows():
    quote_day = quote['date_clean'][:10]  # the first ten characters of the date ('YYYY-MM-DD')

    if current_day_daily is None:
        current_day_daily = quote_day
    elif quote_day != current_day_daily:
        # NOTE(review): the finished total is stored under the label of the day that
        # is just starting, and the last day's total is never appended; confirm intended.
        agg_sentiment_daily.append(current_size_daily)
        agg_dates_daily.append(quote_day)
        current_size_daily = 0
        current_day_daily = quote_day

    # occurrence-weighted contribution of this quote
    current_size_daily += quote['compound'] * quote['numOccurrences']

    

In [None]:
# Line chart of the daily aggregates (occurrence-weighted only).
fig_4_sentiment, ax_4_sentiment = plt.subplots(figsize=(18, 8))

# Pair every day label with its aggregated score (zip stops at the shorter list).
day_score_pairs = list(zip(agg_dates_daily, agg_sentiment_daily))
result_daily = [day_label for day_label, _ in day_score_pairs]
result_daily_values = [day_score for _, day_score in day_score_pairs]

ax_4_sentiment.plot(result_daily, result_daily_values, color='green')
loc_daily = plticker.MultipleLocator(base=30)  # one tick for every 30th plotted day
ax_4_sentiment.xaxis.set_major_locator(loc_daily)
# The plotted daily totals are built from compound * numOccurrences only, and
# the data are quotes, so the title no longer mentions tweets or speaker weights.
ax_4_sentiment.set_title('1 day aggregation scores for quotes', fontsize=18)
ax_4_sentiment.set_xlabel('Date', fontsize=18)
ax_4_sentiment.set_ylabel('Aggregated sentiment intensity score', fontsize=18)

plt.xticks(rotation=90);

<br><br>
## Part 3: Word cloud
In this section we will look at word clouds for the positive and negative quotes.

In [None]:
# Build one long string containing every quotation, each preceded by a space.
# Joining once avoids re-copying the growing string on every row.
all_quotation_text = "".join(
    " " + quotation
    for quotation in tqdm(sentiment_df['quotation'], total=len(sentiment_df))
)

# Remove brand/product words so they do not dominate the word cloud.
# The longest terms go first: removing 'Apple' before 'Apple watch' (or 'iPhone'
# before 'iPhones') would leave the longer term unmatched and a stray ' watch'/'s' behind.
brand_terms = ['Apple', 'apple', 'iPad', 'iPhone', 'Apple watch', 'apples', 'Apples', 'iPhones']
for string in sorted(brand_terms, key=len, reverse=True):
    all_quotation_text = all_quotation_text.replace(string, "")

# Drop isolated 's'/'S' tokens, but keep one space so that the neighbouring
# words are not glued together into a single token.
for string in [' S ', ' s ']:
    all_quotation_text = all_quotation_text.replace(string, " ")



**Removing words that are products:**

In [None]:
# Materialise the stop-word collection imported from `wordcloud` as a list.
stopwords = [stop_word for stop_word in STOPWORDS]

### Word cloud for positive quotations

In [None]:
# Read the logo that gives the cloud its shape and colours; the context manager
# closes the image file as soon as the pixel array has been copied.
with Image.open("./data/apple_logo.png") as logo_image:
    mask = np.array(logo_image)

word_cloud = WordCloud(stopwords=stopwords, background_color="white", mask=mask, mode="RGBA", max_words=500)
wc_all_quotes = word_cloud.generate(all_quotation_text)
image_colors = ImageColorGenerator(mask)

# A single figure: the extra bare `plt.figure()` only produced an empty figure.
plt.figure(figsize=[20, 20])
plt.imshow(wc_all_quotes.recolor(color_func=image_colors), interpolation='bilinear')
plt.axis("off");
# Save before `plt.show()`, which is the last call on the finished figure.
plt.savefig("./data/apple_word_cloud.png", format="png")

plt.show()


<br><br><br><br><br><br>
# SECTION 3: Stock sentiment analysis

## Preprocessing of data from stock analysis

#### Load stock_data

In [None]:
# Price history plus the two pre-joined event/earnings tables, all read from ./data.
stock_data_dir = './data'
stock_data = pd.read_csv(f'{stock_data_dir}/AAPL_2015_to_2020_yahoo_finance.csv')
apple_event_and_stock_data = pd.read_csv(f'{stock_data_dir}/events_dates_with_stock_data.csv')
apple_earnings_and_stock_data = pd.read_csv(f'{stock_data_dir}/earnings_dates_with_stock_data.csv')

#### Remove unnecessary data

Remove the stock data after 16-04-2020 due to the lack of sentiment data after that date.

In [None]:
# Keep every row whose 'Date' does not compare greater than the cutoff
# (rows for which the comparison is False, including missing dates, are kept).
last_sentiment_date = '2020-04-16'

stock_data = stock_data.loc[~stock_data['Date'].gt(last_sentiment_date)]
apple_event_and_stock_data = apple_event_and_stock_data.loc[
    ~apple_event_and_stock_data['Date'].gt(last_sentiment_date)
]
apple_earnings_and_stock_data = apple_earnings_and_stock_data.loc[
    ~apple_earnings_and_stock_data['Date'].gt(last_sentiment_date)
]

#### Convert the Date values into datetime

In [None]:
# Parse the first ten characters ('YYYY-MM-DD') of every date string into a datetime.
stock_data["Date"] = [
    dt.datetime.strptime(raw_date[:10], '%Y-%m-%d') for raw_date in stock_data['Date']
]

### Calculate weekly average

When plotting the stock price development, we want to use the weekly average price of the stock. This is due to the missing stock data for bank holidays and weekends.

In [None]:
# Work on a copy so the daily `stock_data` frame stays untouched.
stock_data_weekly_average = stock_data.copy()

# Move every date back by seven days, then average all columns per 'W-MON' bin.
stock_data_weekly_average['Date'] = (
    pd.to_datetime(stock_data_weekly_average['Date']) - pd.to_timedelta(7, unit='d')
)
weekly_bins = pd.Grouper(key='Date', freq='W-MON')
stock_data_weekly_average = (
    stock_data_weekly_average
    .groupby([weekly_bins])
    .mean()
    .reset_index()
    .sort_values('Date')
)

### Calculate monthly change

Measure the monthly change of the stock price. This will be used later when comparing the monthly price change and the monthly sentiment score.

In [None]:
# First 'Open' and last 'Close' of every calendar-month bin.
monthly_bins = stock_data.groupby(pd.Grouper(key='Date', freq='M'))
monthly_open_prize = monthly_bins.first()['Open']
monthly_close_price = monthly_bins.last()['Close']

# Percentage change from the month's first open to its last close.
monthly_change_pct = (monthly_close_price - monthly_open_prize) / monthly_open_prize * 100
stock_diff_monthly = pd.DataFrame(monthly_change_pct, columns=['Price_difference'])

`stock_diff_monthly` contains the monthly price change of the Apple stock in percent.

### Calculate weekly change

Measure the weekly change of the stock price. This will be used later when comparing the weekly price change and the weekly sentiment score.

In [None]:
# First 'Open' and last 'Close' of every weekly ('W') bin.
weekly_bins_stock = stock_data.groupby(pd.Grouper(key='Date', freq='W'))
weekly_open_prize = weekly_bins_stock.first()['Open']
weekly_close_price = weekly_bins_stock.last()['Close']

# Percentage change from the week's first open to its last close.
weekly_change_pct = (weekly_close_price - weekly_open_prize) / weekly_open_prize * 100
stock_diff_weekly = pd.DataFrame(weekly_change_pct, columns=['Price_difference'])

`stock_diff_weekly` contains the weekly price change of the Apple stock in percent.

## Preprocessing of data from sentiment analysis

#### Load the sentiment score

The sentiment score calculated in `sentiment_analysis.ipynb`is loaded to be used later in addition to the stock prices already loaded.

In [None]:
# Sentiment-scored quotes exported to CSV; loaded under a section-specific name.
sentiment_csv_path = './data/sentiment_df.csv'
sentiment_df_section_stock = pd.read_csv(sentiment_csv_path)

### Monthly sentiment

We use the same method as displayed in `sentiment_analysis.ipynb`to calculate the monthly sentiment data.

In [None]:
# Monthly totals of compound * numOccurrences, built by walking the rows of
# `sentiment_df_section_stock` in frame order and closing a bucket whenever the
# first seven characters of `date_clean` (the 'YYYY-MM' part) change.
agg_month_sentiment = []
agg_month_dates = []

current_month = None
current_size = 0
for _, quote in sentiment_df_section_stock.iterrows():
    quote_month = quote['date_clean'][:7]

    if current_month is None:
        # very first row: open the first bucket
        current_month = quote_month
    elif quote_month != current_month:
        # The finished total is stored under the label of the month that is just
        # starting; the total of the last month in the frame is never appended.
        agg_month_sentiment.append(current_size)
        agg_month_dates.append(quote_month)
        current_size = 0
        current_month = quote_month

    # occurrence-weighted contribution of this quote
    current_size += quote['compound'] * quote['numOccurrences']

# Plain list copies of the labels and totals (zip stops at the shorter list).
month_score_pairs = list(zip(agg_month_dates, agg_month_sentiment))
result_mon = [month_label for month_label, _ in month_score_pairs]
result_values = [month_score for _, month_score in month_score_pairs]

We create a monthly aggregated sentiment dataframe that will be useful later when merging stock data and sentiment data.

In [None]:
# One row per aggregated month; the 'YYYY-MM' labels become first-of-month timestamps.
sentiment_monthly = pd.DataFrame({'Date': result_mon, 'Sentiment_score': result_values})
sentiment_monthly['Date'] = pd.to_datetime(sentiment_monthly['Date'], format='%Y-%m')
sentiment_monthly.head()

### Weekly sentiment

To make it easier to compare the weekly sentiment with the weekly stock price, we remove the sentiment data from before 2015-01-04, so that the 7-day sentiment windows start on that date. This makes it easier to merge the two dataframes later.

In [None]:
# Independent copy of the rows whose `date_clean` does not compare lower than the
# start date (rows for which the comparison is False, including missing dates, stay).
weekly_window_start = '2015-01-04'
sentiment_df2 = sentiment_df_section_stock.loc[
    ~sentiment_df_section_stock['date_clean'].lt(weekly_window_start)
].copy()

In [None]:
# Weekly totals of compound * numOccurrences: a bucket is closed after seven
# day changes have been seen while walking `sentiment_df2` in frame order.
agg_week_sentiment_weekly = []
agg_week_dates_weekly = []
day_counter_weekly = 0

current_day_weekly = None
current_size_weekly = 0
for _, quote in sentiment_df2.iterrows():
    quote_day = quote['date_clean'][:10]  # the first ten characters of the date ('YYYY-MM-DD')

    if current_day_weekly is None:
        current_day_weekly = quote_day
    if quote_day != current_day_weekly:
        day_counter_weekly += 1
        current_day_weekly = quote_day

    if day_counter_weekly == 7:
        # The total is labelled with the day on which the seventh change occurred,
        # i.e. the first day after the seven distinct days that were summed.
        agg_week_sentiment_weekly.append(current_size_weekly)
        agg_week_dates_weekly.append(current_day_weekly)
        current_size_weekly = 0
        day_counter_weekly = 0

    # occurrence-weighted contribution of this quote
    current_size_weekly += quote['compound'] * quote['numOccurrences']

# Plain list copies of the labels and totals (zip stops at the shorter list).
week_score_pairs = list(zip(agg_week_dates_weekly, agg_week_sentiment_weekly))
result_week = [week_label for week_label, _ in week_score_pairs]
result_week_values = [week_score for _, week_score in week_score_pairs]

In [None]:
# One row per aggregated week, with the day labels parsed into timestamps.
sentiment_weekly = pd.DataFrame({'Date': result_week, 'Sentiment_score': result_week_values})
sentiment_weekly['Date'] = pd.to_datetime(sentiment_weekly['Date'])

sentiment_weekly.head()

## Correlation between stock and sentiment (Months)

We have now preprocessed the data and can start investigating whether there is any relationship between the stock price and the sentiment. We start by looking at the monthly change in the stock price and compare it to the aggregated sentiment score for the same month.

### Stock price and sentiment for the current month

In [None]:
# Two stacked panels sharing the x axis: weekly average close on top, monthly sentiment below.
fig_stock_sentiment, stock_sentiment_axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(20, 10))
ax1_stock_sentiment, ax2_stock_sentiment = stock_sentiment_axes

# Top panel: price line with the event and earnings dates drawn on top (zorder=3).
ax1_stock_sentiment.plot(
    stock_data_weekly_average['Date'], stock_data_weekly_average['Close'], label='Apple stock'
)
ax1_stock_sentiment.set_ylabel('Stock closing price ($)', size=18)
ax1_stock_sentiment.scatter(
    apple_event_and_stock_data['Date'], apple_event_and_stock_data['Close'],
    c='r', s=100, zorder=3, label='Apple events',
)
ax1_stock_sentiment.scatter(
    apple_earnings_and_stock_data['Date'], apple_earnings_and_stock_data['Close'],
    c='g', s=100, zorder=3, label='Apple earnings date',
)
ax1_stock_sentiment.legend(prop={'size': 25})

# Bottom panel: aggregated monthly sentiment.
ax2_stock_sentiment.plot(sentiment_monthly['Date'], sentiment_monthly['Sentiment_score'], color='green')
ax2_stock_sentiment.set_ylabel('Aggregated sentiment intensity score', fontsize=18)
plt.show();

 By just looking at the graphs, it is hard to tell whether the sentiment and the stock price are related in any way. `sentiment_monthly` labels each monthly sentiment score with the first day of the next month, while `stock_diff_monthly` labels each monthly price difference with the last day of the month itself, as seen below:

In [None]:
# Show the first date label of each table side by side.
first_sentiment_date = sentiment_monthly.Date[0]
first_stock_date = stock_diff_monthly.index[0]
print(f"sentiment_monthly: {first_sentiment_date}, stock_diff_monthly: {first_stock_date}")

We subtract one day from the dates in `sentiment_monthly` so that we can easily merge the two dataframes based on the `Date` values.

In [None]:
# Move every label one calendar day back.
# NOTE(review): this mutates `sentiment_monthly` in place, so running the cell a
# second time shifts the dates by another day; run it exactly once per kernel.
sentiment_monthly['Date'] -= DateOffset(days=1)

 We therefore merge the two dataframes on the `Date` column to further explore the relationship between the sentiment and the price.

In [None]:
# Inner join: only dates present in both tables survive.
stock_price_sentiment_monthly = stock_diff_monthly.merge(sentiment_monthly, how='inner', on='Date')

In [None]:
# Preview the first rows of the merged monthly price-change / sentiment table.
stock_price_sentiment_monthly.head()

We then plot the development of the stock price and the development of the sentiment to see if we see any obvious relationships between the two.

In [None]:
# White grid theme without the right/top spines, fonts scaled by 2.
despined_axes = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="whitegrid", font_scale=2, rc=despined_axes)

In [None]:
# Scatter plot with a fitted regression line: monthly price change vs. same-month sentiment.
fig_stock_sentiment_2, ax_stock_sentiment_2 = plt.subplots(figsize=(20, 10))
ax_stock_sentiment_2 = sns.regplot(
    data=stock_price_sentiment_monthly,
    x='Price_difference',
    y='Sentiment_score',
    ax=ax_stock_sentiment_2,
)
ax_stock_sentiment_2.set_xlabel('Price difference from start to end of a month (%)', size=18)
ax_stock_sentiment_2.set_ylabel('Sentiment score for current month', size=18)

We see a slightly positive relationship by looking at the correlation between the stock price and the sentiment score. From this, it might seem that it is more likely to rise in months where the Apple stock gets a lot of positive media attention. However, these findings are not very useful when predicting future stock prices because we would not know the sentiment before the end of the month.

In [None]:
# Pearson correlation (coefficient and p-value) between same-month sentiment and price change.
monthly_sentiment_scores = stock_price_sentiment_monthly['Sentiment_score']
monthly_price_changes = stock_price_sentiment_monthly['Price_difference']
correlation_coefficient, p_value = pearsonr(monthly_sentiment_scores, monthly_price_changes)

print(f"Between monthly sentiment and monthly price difference: Correlation coefficient: {correlation_coefficient} | p-value: {p_value}")

With a p-value of $0.40$, the correlation is not statistically significant, so we cannot conclude that the monthly sentiment and the monthly price change are related.

### Stock price for current month and sentiment from previous month

We now want to see if there exists a correlation between the previous month sentiment score and this months stock price. This is because such a correlation might help us say something about the most likely future development of the stock.

In [None]:
sentiment_previous_month = sentiment_monthly.copy()

# Each sentiment score in `sentiment_monthly` is dated with the month-end of the
# month it was aggregated for. To pair a month's price change with the sentiment
# of the PREVIOUS month, that sentiment has to be re-labelled with the NEXT
# month-end (moving the dates backwards would instead attach the following
# month's sentiment to each price row).
# Assumes the 'Date' values already are month-end dates, i.e. the one-day
# subtraction on `sentiment_monthly` has been run exactly once.
sentiment_previous_month['Date'] = sentiment_previous_month['Date'] + MonthEnd(1)

# Each merged row holds the price change of a month and the sentiment of the month before it.
stock_sentiment_previous_month = pd.merge(stock_diff_monthly, sentiment_previous_month, how='inner', on='Date')

In [None]:
# Scatter plot with a fitted regression line for the table merged in the previous cell.
fig_stock_sentiment_3, ax_stock_sentiment_3 = plt.subplots(figsize=(20, 10))
sns.regplot(
    data=stock_sentiment_previous_month,
    x='Price_difference',
    y='Sentiment_score',
    ax=ax_stock_sentiment_3,
)
ax_stock_sentiment_3.set_xlabel('Price difference from start to end of month (%)', size=18)
ax_stock_sentiment_3.set_ylabel('Sentiment score for previous month', size=18);

From the graph there might seem to be a negative relationship between the sentiment for the previous month and the price change for this month. Based on this, we could expect that a stock is more likely to increase in price following a month with a low sentiment score, and vice versa. We check how likely it is that this relationship exists using a Pearson correlation test.

In [None]:
# Pearson correlation (coefficient and p-value) for the previous-month table.
previous_month_scores = stock_sentiment_previous_month['Sentiment_score']
previous_month_price_changes = stock_sentiment_previous_month['Price_difference']
correlation_coefficient, p_value = pearsonr(previous_month_scores, previous_month_price_changes)

print(f"Correlation between monthly price difference and monthly sentiment: Correlation coefficient: {correlation_coefficient} | p-value: {p_value}")

The test also here shows no statistically significant relationship between the prices of this month and the sentiment score from last month. We are thus still without findings when looking for a correlation between the quotes and the stock price.

#### Boxplot of the findings

In [None]:
# Bucket the monthly price change into eight categories. The effective bins are
# left-closed: 1: x < -15, 2: -15 <= x < -10, 3: -10 <= x < -5, 4: -5 <= x < 0,
# 5: 0 <= x < 5, 6: 5 <= x < 10, 7: 10 <= x < 15, 8: x >= 15.
category_edges = [-15, -10, -5, 0, 5, 10, 15]
monthly_price_change = stock_sentiment_previous_month['Price_difference']

# np.digitize returns how many edges are <= x, so adding one gives the category number.
category_numbers = pd.Series(
    np.digitize(monthly_price_change.to_numpy(), category_edges) + 1,
    index=monthly_price_change.index,
).astype(float)

# Rows without a price change get no category (NaN), exactly as with the masks.
stock_sentiment_previous_month['Category'] = category_numbers.where(monthly_price_change.notna())

In [None]:
# Plain white theme without the right/top spines, fonts scaled by 2.
despined_axes = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="white", font_scale=2, rc=despined_axes)

In [None]:
# Box plot of the sentiment score for every price-change category.
fig_stock_sentiment_4, ax_stock_sentiment_4 = plt.subplots(figsize=(15,8))
plot=sns.boxplot(x='Category', y='Sentiment_score', data=stock_sentiment_previous_month, hue='Category', saturation=0.5)
plt.legend([],[], frameon=False)  # hide the legend that `hue` would otherwise add
ax_stock_sentiment_4.set_ylabel('Sentiment score')
ax_stock_sentiment_4.set_xlabel('Price difference')

# Human-readable label for each of the eight category numbers.
category_labels = {
    1: 'x < -15',
    2: '-15 <= x < -10',
    3: '-10 <= x < -5',
    4: '-5 <= x < 0',
    5: '0 <= x < 5',
    6: '5 <= x < 10',
    7: '10 <= x < 15',
    8: 'x >= 15',
}

# Only categories that actually occur get a box, drawn in ascending order, so the
# tick labels are derived from the categories present instead of a fixed list
# (a fixed list mislabels every box as soon as one category is missing).
present_categories = sorted(stock_sentiment_previous_month['Category'].dropna().unique())
positions = tuple(range(len(present_categories)))
labels = tuple(category_labels[int(category)] for category in present_categories)
plt.xticks(positions, labels, rotation=40);

### Stock price for current month and sentiment for following month

Another interesting question to look into is whether a stock rises before it gets the media attention, i.e. whether people only hear the good news when it is already too late to buy the stock. Therefore, we want to see the correlation between the stock price change and the sentiment score of the following month.

In [None]:
# Independent (deep) copy, so the date shift below does not touch `sentiment_monthly`.
sentiment_following_month = sentiment_monthly.copy(deep=True)

In [None]:
# Each sentiment score is dated with the month-end of the month it was aggregated
# for. To pair a month's price change with the sentiment of the FOLLOWING month,
# that sentiment has to be re-labelled with the PREVIOUS month-end (moving the
# dates forward would instead attach the previous month's sentiment to each price row).
# Assumes the 'Date' values already are month-end dates.
sentiment_following_month['Date']=sentiment_following_month['Date']-MonthEnd(1)

In [None]:
# Inner join on 'Date': only dates present in both tables survive.
stock_sentiment_following_month = stock_diff_monthly.merge(sentiment_following_month, how='inner', on='Date')

In [None]:
# White grid theme without the right/top spines, fonts scaled by 2.
despined_axes = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="whitegrid", font_scale=2, rc=despined_axes)

In [None]:
# Scatter plot with a fitted regression line for the following-month table.
fig_stock_sentiment_5, ax_stock_sentiment_5 = plt.subplots(figsize=(20, 10))
ax_stock_sentiment_5 = sns.regplot(
    data=stock_sentiment_following_month,
    x='Price_difference',
    y='Sentiment_score',
    ax=ax_stock_sentiment_5,
)
ax_stock_sentiment_5.set_xlabel('Price difference from start to end of month (%)', size=18)
ax_stock_sentiment_5.set_ylabel('Sentiment score for next month', size=18);

In [None]:
# Pearson correlation (coefficient and p-value) for the following-month table.
correlation_coefficient, p_value=pearsonr(stock_sentiment_following_month.Sentiment_score, stock_sentiment_following_month.Price_difference)
# The table is monthly, so the message says "month" rather than "week".
print(f"Between monthly price difference and sentiment following month: Correlation coefficient: {correlation_coefficient} | p-value: {p_value}")

Here as well, we see no relationship between price and sentiment score.

## Correlation between stock and sentiment (weeks)

So far, we have no results of interest. Therefore, we want to narrow the periods to see if they give any better results than above.

### Stock price and sentiment for current week

In [None]:
# Inner join on 'Date': only week labels present in both tables survive.
stock_sentiment_weekly = stock_diff_weekly.merge(sentiment_weekly, how='inner', on='Date')

In [None]:
# Pearson correlation (coefficient and p-value) between same-week sentiment and price change.
correlation_coefficient, p_value=pearsonr(stock_sentiment_weekly.Sentiment_score, stock_sentiment_weekly.Price_difference)
# The table is weekly, so the message says "weekly" rather than "monthly".
print(f"Correlation between weekly price difference and weekly sentiment: Correlation coefficient: {correlation_coefficient} | p-value: {p_value}")

### Stock price for current week and sentiment from previous week

In [None]:
sentiment_previous_week = sentiment_weekly.copy()

# To pair a week's price change with the sentiment of the PREVIOUS week, each
# sentiment score has to be re-labelled with the date of the NEXT week, i.e. moved
# seven days forward (moving the dates backwards would instead attach the
# following week's sentiment to each price row).
sentiment_previous_week['Date'] = sentiment_previous_week['Date'] + DateOffset(days=7)

# Each merged row holds the price change of a week and the sentiment of the week before it.
stock_sentiment_previous_week = pd.merge(stock_diff_weekly, sentiment_previous_week, how='inner', on='Date')

In [None]:
# Pearson correlation (coefficient and p-value) for the previous-week table.
previous_week_scores = stock_sentiment_previous_week['Sentiment_score']
previous_week_price_changes = stock_sentiment_previous_week['Price_difference']
correlation_coefficient, p_value = pearsonr(previous_week_scores, previous_week_price_changes)
print(f"Correlation between weekly price difference and sentiment score the previous week: Correlation coefficient: {correlation_coefficient} | p-value: {p_value}")

### Stock price for current week and sentiment for following week

In [None]:
sentiment_following_week = sentiment_weekly.copy()

# To pair a week's price change with the sentiment of the FOLLOWING week, each
# sentiment score has to be re-labelled with the date of the PREVIOUS week, i.e.
# moved seven days back (moving the dates forward would instead attach the
# previous week's sentiment to each price row).
sentiment_following_week['Date'] = sentiment_following_week['Date'] - DateOffset(days=7)

# Each merged row holds the price change of a week and the sentiment of the week after it.
stock_sentiment_following_week = pd.merge(stock_diff_weekly, sentiment_following_week, how='inner', on='Date')

In [None]:
# Pearson correlation (coefficient and p-value) for the following-week table.
following_week_scores = stock_sentiment_following_week['Sentiment_score']
following_week_price_changes = stock_sentiment_following_week['Price_difference']
correlation_coefficient, p_value = pearsonr(following_week_scores, following_week_price_changes)
print(f"Correlation between weekly price difference and sentiment score the following week: Correlation coefficient: {correlation_coefficient} | p-value: {p_value}")

As we can see from the p-value above, there is a significant correlation between the stock price and its sentiment the following week.

In [None]:
# Scatter plot with a fitted regression line for the following-week table.
fig_stock_sentiment_6, ax_stock_sentiment_6=plt.subplots(figsize=(20,10))
ax_stock_sentiment_6=sns.regplot(x=stock_sentiment_following_week.Price_difference, y=stock_sentiment_following_week.Sentiment_score)
# The plotted table is weekly, so the axis labels refer to weeks rather than months.
ax_stock_sentiment_6.set_xlabel('Price difference from start to end of week (%)', size=18)
ax_stock_sentiment_6.set_ylabel('Sentiment score for next week', size=18);

## Conclusion

After looking at the weekly and the monthly values of the price change and the sentiment score, we found no statistically significant correlation that can be used to predict the stock price. We found that the stock price might be used to predict the sentiment for the following week. However, this is not very useful when predicting stock prices.

<br><br><br><br><br><br>
# SECTION 4: Event analysis

## Apple product launch analysis
Loading the data and preprocessing.

In [None]:
# Load the yearly Apple-filtered quote files (newest year first) and stack them
# with a single concat instead of re-copying the growing frame on every append.
quote_file_years = (2020, 2019, 2018, 2017, 2016, 2015)
df_event_analysis = pd.concat(
    [pd.read_csv(f"data/quotes-{year}-apple-filter.csv", sep=";") for year in quote_file_years]
)

#List of dates for the apple events
apple_event_dates_str=["2015-03-09","2015-06-10","2015-09-09",
"2016-03-21","2016-06-15","2016-09-07", "2016-10-27",
"2017-06-07", "2017-09-12",
"2018-03-27","2018-06-06", "2018-09-12", "2018-10-30",
"2019-03-25","2019-06-05","2019-09-10","2019-12-02",
"2020-06-24","2020-09-15","2020-10-13","2020-11-10"]

# load sentiment dataframe
sentiment_df_event_analysis = pd.read_csv('./data/sentiment_df.csv')

# The same event dates as `datetime.date` objects.
apple_event_dates = [dt.datetime.strptime(date, "%Y-%m-%d").date() for date in apple_event_dates_str]

# Clean the date column, such that it only contains date information and not timestamp.
# `date_clean` keeps the first ten characters as a string; `date_clean_datetime`
# holds the same day as a `datetime.date` object.
df_event_analysis['date_clean'] = df_event_analysis['date'].str[:10]
df_event_analysis['date_clean_datetime'] = pd.to_datetime(
    df_event_analysis['date_clean'], format='%Y-%m-%d'
).dt.date
sentiment_df_event_analysis['date_clean_datetime'] = pd.to_datetime(
    sentiment_df_event_analysis['date_clean'].str[:10], format='%Y-%m-%d'
).dt.date

# Drop outlier.
# The stacked frame keeps each file's own row labels, so the same label occurs once
# per year; dropping the outlier by label would also delete the unrelated rows that
# share its label. Filtering with a boolean mask removes only the outlier row(s).
OUTLIER_NUM_OCCURRENCES = 39978
df_event_analysis = df_event_analysis[
    df_event_analysis['numOccurrences'] != OUTLIER_NUM_OCCURRENCES
].copy()

In [None]:
# Print up to ten example quotations from the event days.
event_day_quotes = sentiment_df_event_analysis.loc[
    sentiment_df_event_analysis["date_clean"].isin(apple_event_dates_str), "quotation"
]
# A fixed seed makes the printed examples reproducible across runs, and the sample
# size is capped so the cell does not fail when fewer than ten quotes are available.
for quote in event_day_quotes.sample(n=min(10, len(event_day_quotes)), random_state=42):
    print(quote)
    print('------------------------------------------------------')

Let's start by looking at how the number of apple related citations are related to product launches.

In [None]:
# Daily number of quote occurrences. The frame is grouped once and only the
# `numOccurrences` column is summed, instead of summing every column twice.
quote_num = df_event_analysis.groupby('date_clean')['numOccurrences'].sum()
time = quote_num.index

# Days on which an Apple event took place.
is_event_day = time.isin(apple_event_dates_str)

fig_event_analysis, ax_event_analysis = plt.subplots(figsize=(18,8))
ax_event_analysis.plot(time, quote_num, label="Quote occurrences")
ax_event_analysis.set_xlabel("dates", size=15)
# Event days are drawn on top of the line (zorder=3).
ax_event_analysis.scatter(time[is_event_day], quote_num[is_event_day], c="r", s=100, zorder=3, label='Apple events')
plt.xticks(time[::30], rotation=90)  # label every 30th day only
ax_event_analysis.set_ylabel("Quotation occurrences", size=15)
ax_event_analysis.set_title("Number of daily Apple-related Citations 2015 through 2020", size=18)
ax_event_analysis.legend(prop={'size': 20})
plt.show()

**Quantifying the media attention**

To quantify the media attention for a given Apple event, the following metrics are formulated:
- **Attention**: The number of weekly quote occurrences.
- **Baseline attention**: Average attention throughout the time period.
- **Event attention**: Avg. of quote occurrences through event week, prior and post week.
- **Event attention increase %**: (Event attention - Baseline attention) / Baseline attention.


To be able to compute these metrics, as well as make further analysis possible, we create the following dataframe:

*Columns:* \
Event-date, attention_week3_pre, attention_week2_pre, attention_week1_pre, attention_week0, attention_week1_post, attention_week2_post, attention_week3_post


In [None]:
# Events that are analysed: every event dated before 2020.
# NOTE(review): the original condition
# `(year == 2019 and month < 12) or year < 2020` reduces to `year < 2020`, so the
# 2019-12-02 event is included; confirm that December 2019 was not meant to be excluded.
events = [event_day for event_day in apple_event_dates if event_day.year < 2020]
df_events = pd.DataFrame({'event_date': events})

# group data frame by week and year to get weekly attention numbers
# (ISO week number and ISO year, both stored as strings)
quote_iso_calendar = df_event_analysis['date_clean_datetime'].map(lambda day: day.isocalendar())
df_event_analysis['week'] = quote_iso_calendar.map(lambda iso: str(iso[1]))
df_event_analysis['year'] = quote_iso_calendar.map(lambda iso: str(iso[0]))
# Note: despite its name, this key is the week string followed by the year string.
df_event_analysis['yearweek'] = df_event_analysis['week'] + df_event_analysis['year']
df_attention = df_event_analysis.groupby(['year', 'week']).sum().reset_index()


# group sentiment data frame (mean of the columns per ISO year/week)
sentiment_iso_calendar = sentiment_df_event_analysis['date_clean_datetime'].map(lambda day: day.isocalendar())
sentiment_df_event_analysis['week'] = sentiment_iso_calendar.map(lambda iso: str(iso[1]))
sentiment_df_event_analysis['year'] = sentiment_iso_calendar.map(lambda iso: str(iso[0]))
sentiment_week = sentiment_df_event_analysis.groupby(['year', 'week']).mean().reset_index()

def attention_query(date, num_weeks):
    """Return the weekly quote count for the week `num_weeks` away from `date`.

    The date is shifted by `num_weeks` weeks (negative values go back in time),
    converted to its ISO year and ISO week, and looked up in `df_attention`.

    Raises:
        ValueError: if `df_attention` does not hold exactly one row for that
            year/week combination (raised by `Series.item`).
    """
    iso_year, iso_week = (date + dt.timedelta(weeks=num_weeks)).isocalendar()[:2]
    # `df_attention` stores year and week as strings, so compare against strings.
    in_target_week = (df_attention.year == str(iso_year)) & (df_attention.week == str(iso_week))
    return df_attention.loc[in_target_week].numOccurrences.item()

def sentiment_query(date, num_weeks):
    """Return the weekly mean compound score for the week `num_weeks` away from `date`.

    The date is shifted by `num_weeks` weeks (negative values go back in time),
    converted to its ISO year and ISO week, and looked up in `sentiment_week`.

    Raises:
        ValueError: if `sentiment_week` does not hold exactly one row for that
            year/week combination (raised by `Series.item`).
    """
    iso_year, iso_week = (date + dt.timedelta(weeks=num_weeks)).isocalendar()[:2]
    # `sentiment_week` stores year and week as strings, so compare against strings.
    in_target_week = (sentiment_week.year == str(iso_year)) & (sentiment_week.week == str(iso_week))
    return sentiment_week.loc[in_target_week].compound.item()

# One attention and one sentiment column per week offset around every event:
# negative offsets get the suffix '_pre', positive ones '_post', the event week none.
for i in (-3, -2, -1, 0, 1, 2, 3):
    week_suffix = '_pre' if i < 0 else '_post' if i > 0 else ''
    df_events[f"attention_week{i}{week_suffix}"] = df_events['event_date'].map(
        lambda event_day: attention_query(event_day, i)
    )
    df_events[f"sentiment_week{i}{week_suffix}"] = df_events['event_date'].map(
        lambda event_day: sentiment_query(event_day, i)
    )

# Column names in chronological order, used for the box plots.
attention_columns = [
    'attention_week-3_pre',
    'attention_week-2_pre',
    'attention_week-1_pre',
    'attention_week0',
    'attention_week1_post',
    'attention_week2_post',
    'attention_week3_post',
]
sentiment_columns = [
    'sentiment_week-3_pre',
    'sentiment_week-2_pre',
    'sentiment_week-1_pre',
    'sentiment_week0',
    'sentiment_week1_post',
    'sentiment_week2_post',
    'sentiment_week3_post',
]



Let's visualize the typical distribution of media attention

In [None]:
# Box plot of the weekly quote counts from three weeks before to three weeks after an event.
fig, ax = plt.subplots(figsize=(18, 8))
# A single theme call: it already sets font_scale=2, so no separate `sns.set` is needed.
despined_axes = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="white", font_scale=2, rc=despined_axes)

ax = sns.boxplot(data=df_events[attention_columns], saturation=0.5, ax=ax)
ax.set_ylabel("Apple related quotes")

# Replace the column names on the x axis with readable week descriptions.
positions = (0, 1, 2, 3, 4, 5, 6)
labels = (
    "3 weeks prior",
    "2 weeks prior",
    "1 week prior",
    "Event week",
    "1 week after",
    "2 weeks after",
    "3 weeks after",
)
plt.xticks(positions, labels, rotation=22.5);


Looking at the medians, we see that the media attention rises until the week of the event where it peaks and then gradually declines in the following weeks. This shows that Apple is able to build anticipation in the media when launching new products.

In [None]:
fig, ax = plt.subplots(figsize=(18, 8))

# One box per week offset, showing the spread of weekly mean sentiment across events
sns.boxplot(data=df_events[sentiment_columns], saturation=0.5, ax=ax)
ax.set_title("Sentiment of quotes before, during and after an Apple event")
ax.set_ylabel("Sentiment of Apple related quotes")

# Human-readable tick labels; `positions` and `labels` are reused by later cells
labels = ("3 weeks prior", "2 weeks prior", "1 week prior", "Event week", "1 week after", "2 weeks after", "3 weeks after")
positions = tuple(range(len(labels)))
ax.set_xticks(positions)
ax.set_xticklabels(labels);

Above we see a boxplot of the average sentiment of the quotes in the weeks prior to and after an Apple event. The sentiment value ranges from -1 to 1, with 1 being very positive and -1 being very negative. Across all weeks, we see that the sentiment of the quotes typically lies around 0.2, indicating a slightly positive but mostly neutral sentiment. Although the medians do not differ much, the median of the event week is still the highest. From this we can conclude that Apple generally has a stable reputation in the media, with slightly positive quotes made about them on average.

In [None]:
# Baseline: mean weekly number of Apple quotes over the whole attention table
baseline = df_attention['numOccurrences'].mean()
df_events['baseline'] = baseline

# Mean attention over the seven-week window centred on each event
columns = [
    'attention_week-3_pre',
    'attention_week-2_pre',
    'attention_week-1_pre',
    'attention_week0',
    'attention_week1_post',
    'attention_week2_post',
    'attention_week3_post',
]
df_events['event_attention'] = df_events[columns].mean(axis=1)

# Relative deviation from the baseline, in percent
df_events['attention_increase'] = 100 * (df_events['event_attention'] - df_events['baseline']) / df_events['baseline']

# Bar colour for the plots below: red for below-baseline events, green otherwise
df_events['color'] = np.where(df_events['attention_increase'] < 0, 'indianred', 'seagreen')
df_events

###

Now let's look at how the individual Apple events have deviated from the attention baseline.

In [None]:
fig, ax = plt.subplots(figsize=(18, 8))

# One bar per event; colour encodes above/below baseline
ax.bar(x=apple_event_dates_str[:17], height=df_events.attention_increase, color=df_events.color)
plt.xticks(rotation=90)
ax.set_xlabel("Event dates")
ax.set_ylabel("Attention increase from baseline (%)")
ax.set_title("Apple events' deviation from attention baseline")

# (x position, y position, product label, text colour) for the annotated bars
product_annotations = [
    (10.9, 10, "iPhone XS, XR, Apple Watch", "black"),
    (7.9, 10, "iPhone 8, iPhone X", "white"),
    (4.9, 10, "iPhone 7, AirPods", "white"),
    (1.9, 10, "iPhone 6S,", "white"),
    (1.9, 65, "iPad Pro/Mini", "black"),
    (5.9, -90, "Macbook Pro", "white"),
    (2.9, -90, "iPhone SE, iPad Pro", "white"),
    (3.9, -50, "Software", "white"),
    (12.9, -100, "Apple TV+/Card/Arcade", "black"),
    (14.9, -105, "iPhone 11, Apple Watch", "black"),
]
for x_pos, y_pos, product, text_color in product_annotations:
    ax.text(x_pos, y_pos, product, fontsize=14, rotation=90, color=text_color)

In [None]:
fig, ax = plt.subplots(figsize=(12, 18))

# Horizontal variant of the chart above: events on the y-axis, deviation on the x-axis
ax.barh(y=apple_event_dates_str[:17], width=df_events.attention_increase, color=df_events.color)

# For a horizontal bar chart the value axis is x and the category axis is y
ax.set_xlabel("Attention increase from baseline (%)")
ax.set_ylabel("Event dates")
ax.set_title("Apple events' deviation from attention baseline")

# (x position, y position, product label, text colour) for the annotated bars
product_annotations = [
    (10, 10.9, "iPhone XS, XR, Apple Watch", "black"),
    (10, 7.9, "iPhone 8, iPhone X", "white"),
    (10, 4.9, "iPhone 7, AirPods", "white"),
    (10, 1.9, "iPhone 6S,", "white"),
    (65, 1.9, "iPad Pro/Mini", "black"),
    (-90, 5.9, "Macbook Pro", "white"),
    (-90, 2.9, "iPhone SE, iPad Pro", "white"),
    (-50, 3.9, "Software", "white"),
    (-100, 12.9, "Apple TV+/Card/Arcade", "black"),
    (-105, 14.9, "iPhone 11, Apple Watch", "black"),
]
for x_pos, y_pos, product, text_color in product_annotations:
    ax.text(x_pos, y_pos, product, fontsize=14, color=text_color)

## Does the sentiment around quarterly earnings announcements differ from that around product launches?

In [None]:
quarterly_earnings = pd.read_excel('./data/Earnings_Apple.xlsx')

# Remove the first eight rows of the sheet.
# NOTE(review): presumably rows without usable results (e.g. not-yet-reported quarters) - confirm.
quarterly_earnings = quarterly_earnings.drop(index=list(range(8)))

# Parse the "Earnings Date" strings: strip the ", 12 AMEST" time suffix and the
# remaining commas, then read the rest as "<abbreviated month> <day> <year>".
quarterly_earnings["Earnings Date"] = (
    quarterly_earnings["Earnings Date"]
    .map(lambda raw: raw.replace(", 12 AMEST", "").replace(",", ""))
    .map(lambda cleaned: dt.datetime.strptime(cleaned, "%b %d %Y"))
)

# Clean surprise factor: convert the signed strings (e.g. "+4.23", "-1.50") to floats.
# Multiplying a string by -1 yields an empty string, so negative values must be
# parsed numerically rather than "negated" as text. Unparseable cells become NaN.
quarterly_earnings["Surprise(%)"] = pd.to_numeric(
    quarterly_earnings["Surprise(%)"].astype(str).str.strip(), errors="coerce"
)

# Keep announcements up to and including 2019, i.e. drop everything from 2020 onwards.
# A boolean mask avoids relying on the row labels left over from the drop above.
announced_by_2019 = quarterly_earnings["Earnings Date"].map(lambda date: date.year <= 2019)
quarterly_earnings = quarterly_earnings.loc[announced_by_2019].copy()

# Add sentiment columns for the three weeks before and after each announcement
for week_offset in (-3, -2, -1, 0, 1, 2, 3):
    if week_offset < 0:
        suffix = '_pre'
    elif week_offset > 0:
        suffix = '_post'
    else:
        suffix = ''
    quarterly_earnings[f"sentiment_week{week_offset}{suffix}"] = quarterly_earnings.apply(
        lambda row: sentiment_query(row['Earnings Date'], week_offset), axis=1)

In [None]:
# Mean sentiment per week offset, averaged over all events of each type
a = quarterly_earnings[sentiment_columns].mean().tolist()  # quarterly earnings announcements
b = df_events[sentiment_columns].mean().tolist()           # product launches

# Grouped bar chart: two bars per week offset, side by side
fig, ax = plt.subplots(figsize=(20, 10))
x = np.arange(len(sentiment_columns))
width = 0.4

# Label each series where it is drawn so the legend cannot get out of sync with
# the plotting order (a positional legend list is matched to the bars in draw order).
ax.bar(x - width / 2, a, width, color='darkslategrey', label='Quarterly earnings announced')
ax.bar(x + width / 2, b, width, color='cornflowerblue', label='Product launches')

# `labels` holds the week tick labels defined in the boxplot cells above
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
ax.set_ylabel("Mean sentiment of quotes")
ax.set_title("Comparison of mean sentiment for product launches and earning announcements");


Looking at the plot above, it can be seen that the sentiment of quotes before and after product launches is, on average, very similar to the sentiment of quotes before and after announcements of quarterly earnings. The quotes are, however, generally more positive in the weeks where quarterly earnings are announced than in the weeks where new products are launched. This can be explained by the fact that quarterly reports are typically described with more adjectives, whereas product launches may be more factual and thus neutral.

In [None]:

# Mean sentiment per week offset across all product launches
b = df_events[sentiment_columns].mean().tolist()

# Theme is set before the figure is created
sns.set_theme(font='Arial', font_scale=2, style='white',
              rc={'axes.spines.top': False, 'axes.spines.right': False})

# Single-series bar chart; `labels` holds the week tick labels defined in the boxplot cells above
fig, ax = plt.subplots(figsize=(20, 10))
x = np.arange(7)
width = 0.5
ax.bar(x, b, width=width, color='#800020')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=30)
ax.set_ylabel("Mean sentiment of quotes")

From the plot it is clear that not only the attention, but also the sentiment towards Apple increases during an event week. In addition, the plot indicates that the sentiment in the weeks after an event is more positive than in the weeks prior. To investigate this we will perform a Mann-Whitney U-test, a robust, non-parametric test which measures whether there exists a significant difference between two distributions.

Source: [Towards Data Science](https://towardsdatascience.com/intro-to-data-science-part-3-data-analysis-71a566c3a8c3)

In [None]:
# List the columns of df_events, including the per-week attention/sentiment columns added above
df_events.columns

In [None]:
# Per-event mean sentiment over the three weeks before and the three weeks after the event
pre_columns = ['sentiment_week-3_pre', 'sentiment_week-2_pre', 'sentiment_week-1_pre']
post_columns = ['sentiment_week3_post', 'sentiment_week2_post', 'sentiment_week1_post']

prior_weeks = df_events[pre_columns].mean(axis=1)
posterior_weeks = df_events[post_columns].mean(axis=1)
event_week = df_events["sentiment_week0"]

# Combine prior and posterior to represent all weeks except event week
# (one row per event, columns = [prior mean, posterior mean])
prior_posterior = np.array([prior_weeks, posterior_weeks]).T

# Conduct test between weeks prior and posterior
test_result = mannwhitneyu(prior_weeks, posterior_weeks)

separator = '----------------------------------------------------------------------'
print('Sentiment for the three weeks prior to a launch:')
print(f'Mean: {prior_weeks.mean()}, Standard Deviation: {prior_weeks.std()}')
print(separator)
print('Sentiment for the three weeks posterior to a launch:')
print(f'Mean: {posterior_weeks.mean()}, Standard Deviation: {posterior_weeks.std()}')
print(separator)
print(f'Mann Whitney U-Test: {test_result}')

# Conduct test between event week and all other weeks (per-event mean of prior and posterior)
event_rest = mannwhitneyu(prior_posterior.mean(axis=1), event_week)
print(separator)
print('Sentiment for event week:')
# Report the event-week statistics themselves, not the posterior-week ones
print(f'Mean: {event_week.mean()}, Standard Deviation: {event_week.std()}')
print(separator)
print(f'Mann Whitney U-Test between event week and all other: {event_rest}')

The null hypothesis of the Mann-Whitney U-test is that the two distributions are equal. The p-value represents the probability of seeing a result at least as extreme as our observations given this assumption. Since the p-value is ~0.3, drawing two random samples from the same distribution would yield a difference at least as large as the observed one roughly 30% of the time. The difference is therefore not significant.

### Does the sentiment/attention before and after a product launch correlate with changes in the stock price?

In [None]:
# Change in sentiment and attention for each product launch:
# week after the event minus week before the event
df_events['sentiment_change'] = df_events["sentiment_week1_post"] - df_events['sentiment_week-1_pre']
df_events['attention_change'] = df_events["attention_week1_post"] - df_events['attention_week-1_pre']


# Weekly mean stock prices, used to compare the week before an event with the week after
stock_data_events = pd.read_csv('./data/AAPL_2015_to_2020_yahoo_finance.csv')
# NOTE(review): shifting by 7 days before the W-MON grouping appears intended to label
# each weekly bin by the Monday that starts it rather than the Monday that ends it - confirm.
stock_data_events.Date = pd.to_datetime(stock_data_events.Date) - pd.to_timedelta(7, unit='d')
stock_data_events = stock_data_events.groupby([pd.Grouper(key='Date', freq='W-MON')]).mean().reset_index().sort_values('Date')

# Take week AND year from the same ISO calendar. The calendar year of a date can differ
# from its ISO year around New Year (e.g. 2019-12-30 is ISO week 1 of 2020), and
# stock_query looks rows up by ISO year and ISO week.
stock_data_events['week'] = stock_data_events['Date'].map(lambda date: date.isocalendar()[1])
stock_data_events['year'] = stock_data_events['Date'].map(lambda date: date.isocalendar()[0])

def stock_query(date):
    """Return the change in stock price from the week before `date` to the week after it.

    The result is `Close` of the following week minus `Open` of the preceding week,
    i.e. "after minus before" - the same sign convention as `sentiment_change` and
    `attention_change`, so a positive value means the price rose.

    Parameters
    ----------
    date : date-like supporting `+/- timedelta` and `.isocalendar()`
        The event date.

    Returns
    -------
    Difference between the two weekly values looked up in `stock_data_events`.

    Raises
    ------
    ValueError
        Raised by `.item()` when a lookup does not match exactly one row.
    """
    # Shift the date itself instead of doing `week - 1` / `week + 1`, so that events
    # in the first or last week of a year resolve to the correct neighbouring week/year.
    year_before, week_before = (date - dt.timedelta(weeks=1)).isocalendar()[:2]
    year_after, week_after = (date + dt.timedelta(weeks=1)).isocalendar()[:2]

    stock_start = stock_data_events.loc[
        (stock_data_events.week == week_before) & (stock_data_events.year == year_before)
    ].Open.item()
    stock_end = stock_data_events.loc[
        (stock_data_events.week == week_after) & (stock_data_events.year == year_after)
    ].Close.item()

    return stock_end - stock_start


# Look up the stock-price difference around every event date
df_events["stock_change"] = df_events["event_date"].map(stock_query)

Let's see if there is any connection between the change in stock price and sentiment/attention

In [None]:
# Shared settings for both panels
spine_rc = {"axes.spines.right": False, "axes.spines.top": False}
stock_axis_label = 'Price difference of stock after product launch ($)'
stock_change = df_events["stock_change"]

sns.set_theme(style='whitegrid', font='Arial', font_scale=1.5, rc=spine_rc)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
sns.set_theme(style='white', font_scale=2,
              rc={'axes.spines.right': False, 'axes.spines.top': False})
sentiment_ax, attention_ax = axes

# Left panel: stock change vs. change in sentiment
sns.regplot(x=stock_change, y=df_events["sentiment_change"], ax=sentiment_ax)
sentiment_ax.set_xlabel(stock_axis_label, size=25)
sentiment_ax.set_ylabel('Difference in sentiment score after product launch ', size=25)

# Right panel: stock change vs. change in attention
sns.regplot(x=stock_change, y=df_events["attention_change"], color='coral', ax=attention_ax)
attention_ax.set_xlabel(stock_axis_label, size=25)
attention_ax.set_ylabel('Difference in attention score after product launch ', size=25)

In [None]:
import statsmodels.api as sm

# Regress the stock price change on the sentiment change.
# statsmodels' OLS does not include an intercept unless one is added explicitly;
# without it the line is forced through the origin and the reported R^2 is the
# uncentered one, which is not comparable to the regression line plotted above.
mod = sm.OLS(df_events["stock_change"], sm.add_constant(df_events["sentiment_change"]))
res = mod.fit()
print(res.summary())

In [None]:
# Regress the stock price change on the attention change.
# statsmodels' OLS does not include an intercept unless one is added explicitly;
# without it the line is forced through the origin and the reported R^2 is the
# uncentered one, which is not comparable to the regression line plotted above.
mod = sm.OLS(df_events["stock_change"], sm.add_constant(df_events["attention_change"]))
res = mod.fit()
print(res.summary())

Both regression lines have very low R^2-values indicating a poor fit. To conclude, the change in stock price from the week before an event to the week after an event can neither be explained by the corresponding change in attention level nor the change in sentiment.