<h1>Data Summary</h1>
<p>Build some basic descriptive distributions of the tweet data.</p>

In [2]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
init_notebook_mode(connected=True)

from stop_words import get_stop_words
from pyspark.sql import SparkSession
from datetime import datetime
from pyspark.sql.functions import udf
from pyspark.sql.types import TimestampType, BooleanType
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import sys, pywt
sys.path.append("./lib")
from tweet_tolkenizers import TweetTolkenizer
from clusterers import Clusterers

# Root of the project checkout; the tweet JSON files are read from its data/ directory.
base_path = "/Users/josephgartner/Projects/nlpDay2017/"
path = base_path + "data/"

# Local Spark session used to build every DataFrame in this notebook.
spk = SparkSession.builder.master("local").getOrCreate()

# Bind the SparkContext explicitly. `sc` is used below (addPyFile, broadcast)
# but was never assigned in the notebook, so a fresh kernel would otherwise
# depend on the launcher having injected it.
sc = spk.sparkContext

df = spk.read.json(path)

# Register the tokenizer module with Spark so code running in UDFs can import it.
sc.addPyFile(base_path + "/lib/tweet_tolkenizers.py")

<h2>Filter French Tweets, Set Up Tokenizer</h2>

In [8]:
def _parse_created_at(date_string):
    """Convert a `created_at` string such as 'Thu Apr 27 06:00:00 +0000 2017' to a datetime."""
    return datetime.strptime(date_string, "%a %b %d %H:%M:%S +0000 %Y")


# Spark UDF producing a timestamp column from the raw `created_at` string.
make_dt = udf(_parse_created_at, TimestampType())

# Keep only tweets whose `lang` field is 'fr', then attach the parsed timestamp.
df = df.filter(df['lang']=='fr')
df = df.withColumn("dt_created", make_dt(df['created_at']))

# French stop words, also made available to the executors as a broadcast variable.
stop_words = set(get_stop_words('fr'))
bc_stop = sc.broadcast(stop_words)

# Tokenizer instance picked up as the default `tt` argument by later cells.
tt = TweetTolkenizer()

# Number of French tweets retained.
print(df.count())

136661


<h2>Find tweets containing the term 'Macron', get timestamp</h2>

In [12]:
def has_token(text, term='macron', tt=tt, lang='fr'):
    """Report whether `term` is one of the tokens produced from `text`.

    Args:
        text: The tweet text to tokenize.
        term: The token to look for (exact match against the token set).
        tt: Tokenizer exposing `tokenize(text, lang)`; defaults to the
            notebook-level `tt` instance bound when this cell is run.
        lang: Language code forwarded to the tokenizer.

    Returns:
        True if `term` is among the tokens, False otherwise. Any ordinary
        error raised while tokenizing or testing membership also yields False.
    """
    try:
        tokens = set(tt.tokenize(text, lang))
        return term in tokens
    except Exception:
        # Deliberate best-effort: input that cannot be tokenized is treated as
        # not containing the term. Catching Exception (not a bare `except:`)
        # lets KeyboardInterrupt and SystemExit propagate.
        return False

# Boolean Spark UDF: applies has_token with its default term, tokenizer and language.
uht = udf(lambda tweet_text: has_token(tweet_text), BooleanType())

In [9]:
# Flag each tweet with the UDF result, then keep only the flagged rows.
df_mac = df.withColumn("has_term", uht(df['text']))
df_mac = df_mac.where(df_mac["has_term"] == True)

# Number of tweets for which the UDF returned True.
df_mac.count()

4381

<h2>Get histograms of Macron Tweets and all Tweets</h2>

In [24]:
# Reference midnight and the open interval of timestamps that are kept.
_day_start = datetime(2017, 4, 27, 0, 0, 0)
_window_open = datetime(2017, 4, 27, 6, 0, 0)
_window_close = datetime(2017, 4, 28, 0, 0, 0)


def _minutes_since_day_start(timestamps):
    """Return minutes elapsed since `_day_start` for timestamps strictly inside the window."""
    return [
        (stamp - _day_start).seconds/60
        for stamp in timestamps
        if _window_open < stamp < _window_close
    ]


# All French tweets: collected timestamps, then minute offsets within the window.
dta2 = [row.dt_created for row in df.select("dt_created").collect()]
dta3 = _minutes_since_day_start(dta2)

# Tweets flagged by the term filter: same treatment.
sig2 = [row.dt_created for row in df_mac.select("dt_created").collect()]
sig3 = _minutes_since_day_start(sig2)

In [37]:
# Bin width in minutes.
scale = 20

# Bins covering a full day minus the bins that would fall in the first 360 minutes.
bins_per_hour = int(60/scale)
n_bins = 24*bins_per_hour - int(360/scale)

# Both histograms share the same bin count and the same minute range.
minute_range = (360, 24*60)
ha, ea = np.histogram(dta3, bins=n_bins, range=minute_range)
hs, es = np.histogram(sig3, bins=n_bins, range=minute_range)

In [38]:
# Substitute a small positive value for empty bins so the division below never hits zero.
ha = np.array([.001 if count <= 0 else count for count in ha])

# Per-bin ratio of term-matching tweets (hs) to all tweets (ha).
ratio = np.divide(hs, ha)

In [39]:
# Single-level discrete wavelet transform of the term-tweet histogram with the
# 'db1' wavelet; cA holds the approximation and cD the detail coefficients.
cA, cD = pywt.dwt(hs, wavelet='db1')

In [40]:
# Left edge (minute of the day) of every histogram bin. The histograms were
# built over range=(360, 24*60), so positions must start at 360 rather than 0;
# taking them from the bin edges keeps the bars aligned with the data.
bin_starts = ea[:-1].tolist()

# Top panel: per-bin ratio of term-matching tweets to all tweets.
d0 = go.Bar(
        name="Tweet Datetime Distribution",
        x=bin_starts,
        y=ratio,
        marker=dict(
            color='#990000'
        )
    )

# Bottom panel: detail coefficients of the single-level DWT. The transform
# yields one coefficient per pair of bins, so every second bin start is used.
d1 = go.Bar(
        name="Discrete Wavelet Transform",
        x=bin_starts[::2],
        y=cD,
        marker=dict(
            color='#009900'
        )
    )

plot_title = 'Tweets per {} minutes'.format(scale)

# Stand-alone layout object, kept for any single-panel use of these traces.
l0 = go.Layout(
    title=plot_title,
    xaxis=dict(
        title='Minute of the day',
        titlefont=dict(
            size=14,
            color='black'
        )
    )
)

fig = tls.make_subplots(rows=2, 
                        cols=1, 
                        shared_xaxes=True, 
                        subplot_titles=('Tweet Datetime Distribution', "Discrete Wavelet Transform")
                       )
fig.append_trace(d0, 1, 1)
fig.append_trace(d1, 2, 1)

# The subplot figure carries its own layout, so l0 was never shown; apply the
# title to the figure that is actually rendered.
# NOTE(review): l0's x-axis title is not copied over because the subplot axis
# key may differ between plotly versions; confirm the key before attaching it.
fig['layout'].update(title=plot_title)

iplot(fig, filename='dice_pmf')

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x1,y2 ]

