In [25]:
import pandas as pd
import numpy as np
import bz2
import json
import requests
import findspark
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
from datetime import datetime
import sys
import urllib.request as r
import urllib.request
import re
from bs4 import BeautifulSoup
from dateutil.parser import parse
from urllib.request import urlopen, Request
import csv

In [26]:
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

Ideas of events for which we could have a good distribution. Eg, events in 2017:

Arrest of Alexei Navalny in 2017 


Bob Dylan recieving its nobel prize 


Election of Emmanuel Macron


"Wonder Woman" comes out, propulsing Gal Gadot's carrier 


Emperor Akihito abdicates


Commercialisation of the first mass market Tesla, the Model 3


O.J. Simpson release


Harvey Weinstein accused of sexual assault


Kevin Spacey appology for sexual assault

In [27]:
df = spark.read.json('data/quotes-2017.json.bz2')

KeyboardInterrupt: 

In [None]:
df.printSchema()

In [59]:
df_parasite = df.filter(df.quotation.contains('Tesla'))

parasite = df_parasite.toPandas()
parasite_words = parasite.iloc[:,5]

In [60]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

def token_stop_pos(text):
    
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        
        if word.lower() not in set (stopwords.words('english')):
            newlist.append(tuple([word, pos_dict.get(tag[0])]))
            
    return newlist

In [61]:
# Define a function to clean the text
def clean(text):
# Removes all special characters and numericals leaving the alphabets
    text = re.sub('[^A-Za-z]+', ' ', text)
    return text
parasite

Unnamed: 0,date,numOccurrences,phase,probas,qids,quotation,quoteID,speaker,urls
0,2017-08-18 15:47:25,2,E,"[[Henrik Fisker, 0.9207], [None, 0.0793]]",[Q438667],Tesla doesn't really have a competitor. It doe...,2017-08-18-103393,Henrik Fisker,[http://thestreet.com/story/14276384/1/henrik-...
1,2017-05-04 18:06:37,1,E,"[[Elon Musk, 0.6801], [None, 0.3199]]",[Q317521],It will be the kind of thing where you hope th...,2017-05-04-071108,Elon Musk,[http://www.thedrive.com/new-cars/9971/tesla-t...
2,2017-03-11 22:16:22,2,E,"[[None, 0.734], [Jason Isaac, 0.266]]",[],No other vehicle manufacturer is seeking to ch...,2017-03-11-046540,,[http://myhighplains.com/news/tesla-tries-a-ne...
3,2017-07-22 19:04:21,1,E,"[[Matthias Müller, 0.5869], [Dieter Zetsche, 0...","[Q15834434, Q1747899, Q1910173, Q24257548, Q25...",Tesla is a successful electric automotive comp...,2017-07-22-061435,Matthias Müller,[http://insideevs.com/automaker-execs-tesla-is...
4,2017-03-06 10:52:11,3,E,"[[Jim Green, 0.6435], [None, 0.3565]]","[Q21585528, Q3506849, Q6195314]",In the future it is quite possible that an inf...,2017-03-06-037120,Jim Green,[http://www.business-standard.com/article/pti-...
...,...,...,...,...,...,...,...,...,...
3726,2017-11-02 00:56:49,1,E,"[[Joe White, 0.7683], [None, 0.2317]]","[Q49001913, Q6212966, Q6212968, Q6212969]",They are still playing catch up particularly i...,2017-11-02-134190,Joe White,[http://www.themalaymailonline.com/drive/artic...
3727,2017-08-03 19:06:35,2,E,"[[Brett Smith, 0.7814], [None, 0.2051], [Elon ...","[Q16235028, Q39070154, Q57472681, Q910861]",important inflexion point for both Tesla and t...,2017-08-03-067676,Brett Smith,[http://www.theworldweekly.com/reader/view/mag...
3728,2017-05-06 00:00:00,19,E,"[[None, 0.8717], [Elon Musk, 0.1283]]",[],The analogy I like to use is: that's a Tesla c...,2017-05-06-055538,,[http://www.investing.com/news/technology-news...
3729,2017-12-24 07:06:38,2,E,"[[Ryan Zinke, 0.7373], [None, 0.2628]]",[Q7384672],There are places for public private partnershi...,2017-12-24-046857,Ryan Zinke,[https://www.abqjournal.com/1110774/zinke-unda...


In [62]:
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer


parasite['Cleaned Reviews'] = parasite['quotation'].apply(clean)

pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
#words = []
#tokenizer = RegexpTokenizer(r'\w+')

#for it in parasite_words.tolist():

#    words.extend(tokenizer.tokenize(it))

parasite['POS tagged'] = parasite['Cleaned Reviews'].apply(token_stop_pos)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Benjamin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [63]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

parasite['Lemma'] = parasite['POS tagged'].apply(lemmatize)
parasite.head()

Unnamed: 0,date,numOccurrences,phase,probas,qids,quotation,quoteID,speaker,urls,Cleaned Reviews,POS tagged,Lemma
0,2017-08-18 15:47:25,2,E,"[[Henrik Fisker, 0.9207], [None, 0.0793]]",[Q438667],Tesla doesn't really have a competitor. It doe...,2017-08-18-103393,Henrik Fisker,[http://thestreet.com/story/14276384/1/henrik-...,Tesla doesn t really have a competitor It does...,"[(Tesla, n), (really, r), (competitor, n), (lo...",Tesla really competitor look like one go eme...
1,2017-05-04 18:06:37,1,E,"[[Elon Musk, 0.6801], [None, 0.3199]]",[Q317521],It will be the kind of thing where you hope th...,2017-05-04-071108,Elon Musk,[http://www.thedrive.com/new-cars/9971/tesla-t...,It will be the kind of thing where you hope th...,"[(kind, n), (thing, n), (hope, v), (service, n...",kind thing hope service take long time absol...
2,2017-03-11 22:16:22,2,E,"[[None, 0.734], [Jason Isaac, 0.266]]",[],No other vehicle manufacturer is seeking to ch...,2017-03-11-046540,,[http://myhighplains.com/news/tesla-tries-a-ne...,No other vehicle manufacturer is seeking to ch...,"[(vehicle, n), (manufacturer, n), (seeking, v)...",vehicle manufacturer seek change law Tesla n...
3,2017-07-22 19:04:21,1,E,"[[Matthias Müller, 0.5869], [Dieter Zetsche, 0...","[Q15834434, Q1747899, Q1910173, Q24257548, Q25...",Tesla is a successful electric automotive comp...,2017-07-22-061435,Matthias Müller,[http://insideevs.com/automaker-execs-tesla-is...,Tesla is a successful electric automotive comp...,"[(Tesla, n), (successful, a), (electric, a), (...",Tesla successful electric automotive company...
4,2017-03-06 10:52:11,3,E,"[[Jim Green, 0.6435], [None, 0.3565]]","[Q21585528, Q3506849, Q6195314]",In the future it is quite possible that an inf...,2017-03-06-037120,Jim Green,[http://www.business-standard.com/article/pti-...,In the future it is quite possible that an inf...,"[(future, n), (quite, r), (possible, a), (infl...",future quite possible inflatable structure g...


In [64]:
df_evolution = parasite[["speaker","quotation", "Cleaned Reviews", "POS tagged", "Lemma"]]
print(df_evolution["quotation"][1])
print(df_evolution["Cleaned Reviews"][1])
print(df_evolution["POS tagged"][1])
print(df_evolution["Lemma"][1])

It will be the kind of thing where you hope that service takes a long time because you have the absolute top-of-the-line Tesla as a service loaner,
It will be the kind of thing where you hope that service takes a long time because you have the absolute top of the line Tesla as a service loaner 
[('kind', 'n'), ('thing', 'n'), ('hope', 'v'), ('service', 'n'), ('takes', 'v'), ('long', 'a'), ('time', 'n'), ('absolute', 'a'), ('top', 'n'), ('line', 'n'), ('Tesla', 'n'), ('service', 'n'), ('loaner', 'n')]
  kind thing hope service take long time absolute top line Tesla service loaner


In [65]:
#fd = nltk.FreqDist(filtered_sentence)

In [66]:
#words_bigram = [w for w in words if w.isalpha()]
#finder = nltk.collocations.BigramCollocationFinder.from_words(words_bigram)
#finder.ngram_fd.most_common(2)

In [67]:
analyzer = SentimentIntensityAnalyzer()

# function to calculate vader sentiment
def vadersentimentanalysis(review):
    vs = analyzer.polarity_scores(review)
    return vs['compound']

df_evolution['Vader Sentiment'] = df_evolution['Lemma'].apply(vadersentimentanalysis)

# function to analyse
def vader_analysis(compound):
    if compound >= 0.5:
        return 'Positive'
    elif compound <= -0.5 :
        return 'Negative'
    else:
        return 'Neutral'

df_evolution['Vader Analysis'] = df_evolution['Vader Sentiment'].apply(vader_analysis)
df_evolution.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_evolution['Vader Sentiment'] = df_evolution['Lemma'].apply(vadersentimentanalysis)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_evolution['Vader Analysis'] = df_evolution['Vader Sentiment'].apply(vader_analysis)


Unnamed: 0,speaker,quotation,Cleaned Reviews,POS tagged,Lemma,Vader Sentiment,Vader Analysis
0,Henrik Fisker,Tesla doesn't really have a competitor. It doe...,Tesla doesn t really have a competitor It does...,"[(Tesla, n), (really, r), (competitor, n), (lo...",Tesla really competitor look like one go eme...,0.4144,Neutral
1,Elon Musk,It will be the kind of thing where you hope th...,It will be the kind of thing where you hope th...,"[(kind, n), (thing, n), (hope, v), (service, n...",kind thing hope service take long time absol...,0.7964,Positive
2,,No other vehicle manufacturer is seeking to ch...,No other vehicle manufacturer is seeking to ch...,"[(vehicle, n), (manufacturer, n), (seeking, v)...",vehicle manufacturer seek change law Tesla n...,0.0,Neutral
3,Matthias Müller,Tesla is a successful electric automotive comp...,Tesla is a successful electric automotive comp...,"[(Tesla, n), (successful, a), (electric, a), (...",Tesla successful electric automotive company...,0.6249,Positive
4,Jim Green,In the future it is quite possible that an inf...,In the future it is quite possible that an inf...,"[(future, n), (quite, r), (possible, a), (infl...",future quite possible inflatable structure g...,0.4019,Neutral


In [68]:
positive = df_evolution[df_evolution["Vader Analysis"] == "Positive"]
top_vals = positive[positive["Vader Sentiment"] > 0.8]
top_vals["quotation"][41]

'The success of Tesla is a good example that you can bring manufacturing back to this country, for sure,'

In [74]:
import text2emotion as te

emotions = []

sentences = df_evolution["Lemma"].tolist()
print(sentences)

for cur in sentences:
    
    emotions.append(te.get_emotion(cur))




In [75]:
emotions

[{'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.67, 'Sad': 0.33, 'Fear': 0.0},
 {'Happy': 0.14, 'Angry': 0.0, 'Surprise': 0.29, 'Sad': 0.0, 'Fear': 0.57},
 {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 1.0, 'Sad': 0.0, 'Fear': 0.0},
 {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.5, 'Sad': 0.25, 'Fear': 0.25},
 {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 0.0, 'Fear': 1.0},
 {'Happy': 0.4, 'Angry': 0.0, 'Surprise': 0.2, 'Sad': 0.2, 'Fear': 0.2},
 {'Happy': 0, 'Angry': 0, 'Surprise': 0, 'Sad': 0, 'Fear': 0},
 {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 0.4, 'Fear': 0.6},
 {'Happy': 0.17, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 0.33, 'Fear': 0.5},
 {'Happy': 0.17, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 0.17, 'Fear': 0.67},
 {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.67, 'Sad': 0.0, 'Fear': 0.33},
 {'Happy': 1.0, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 0.0, 'Fear': 0.0},
 {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.33, 'Sad': 0.33, 'Fear': 0.33},
 {'Happy': 0.08, 'Angry': 0.15, 'Surprise': 