In [None]:
import sys
import pandas as pd
import numpy as np
import re
import collections
import pickle
import string
import dateutil.parser as parser
from datetime import datetime
from matplotlib import pyplot as plt
%matplotlib inline

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# NOTE(review): machine-specific absolute path to a Python 2.7 site-packages
# directory. Presumably required so the textblob/gensim imports in the next
# cell resolve on the author's machine — confirm, and prefer installing the
# packages into the kernel's own environment.
sys.path.append("/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages")

In [None]:
from textblob import TextBlob
import gensim
from gensim import corpora, models, similarities, matutils
from gensim.summarization import summarize

In [None]:
# Load the TED talks table. Later cells read its transcript, tags, duration,
# date_published and views_as_of_06162017 columns.
df = pd.read_csv('TED_Talks_by_ID_plus-transcripts-and-LIWC-and-MFT-plus-views.csv')

In [None]:
# Number of rows in the table as loaded
len(df)

# Preprocessing and summarization

In [None]:
# Build the spoken-text column (timestamps and audience-reaction markers
# removed) and count its whitespace-separated words.
stopwords = ['(Laughter)', '(Applause)']


def _strip_markup(raw):
    """Drop digits:digits timestamps and every marker listed in `stopwords`.

    The value is passed through str() first, so non-string cells are
    stringified rather than rejected.
    """
    cleaned = re.sub(r'[0-9]+:[0-9]+', '', str(raw))
    for marker in stopwords:
        cleaned = cleaned.replace(marker, '')
    return cleaned


df['text'] = df.transcript.map(_strip_markup)
df['words_n'] = df.text.map(lambda spoken: len(spoken.split()))

In [None]:
# Keep only talks whose stripped text has more than one word, then renumber
# the rows from 0.
# NOTE(review): presumably the "> 1" threshold also removes missing
# transcripts, which stringify to a single token — confirm.
has_text = df['words_n'] > 1
df = df.loc[has_text].reset_index(drop=True)

In [None]:
# Two whitespace-normalised copies: 'text_clean' is derived from the stripped
# text, 'transcript_clean' from the raw transcript (laughter/applause kept).
def _squash_whitespace(raw):
    """Replace carriage-return runs with a space, then collapse all whitespace."""
    without_cr = re.sub(r'[\r]+', ' ', raw)
    return ' '.join(without_cr.split())


df['text_clean'] = df.text.map(_squash_whitespace)
df['transcript_clean'] = df.transcript.map(lambda raw: _squash_whitespace(str(raw)))

In [None]:
# Peek at the first 400 characters of the first cleaned talk
df.text_clean[0][:400]

In [None]:
# Summarise every talk with gensim's `summarize` (ratio=0.05) and drop the
# talks that cannot be summarised.
def _summarize_or_none(text):
    """Return the summary of `text` on a single line, or None if summarising fails."""
    try:
        summary = summarize(text, ratio=0.05, word_count=None, split=False)
    except Exception:
        # Best-effort: a talk that cannot be summarised is dropped below.
        # `except Exception` (not a bare except) lets KeyboardInterrupt through.
        return None
    # Newline runs in the summary are flattened to single spaces.
    return re.sub(r'[\n]+', ' ', summary)


# Build the column in one assignment. The previous `df['summary'][i] = ...`
# was chained assignment, which pandas does not guarantee to write back to `df`.
df['summary'] = [_summarize_or_none(text) for text in df.text_clean]
df = df[df['summary'].notna()].reset_index(drop=True)

In [None]:
# Preview the frame after summarisation
df.head()

# Feature engineering

New variables to create:  
* Categories of the most common topics
* Log-transform views
* Length (time)
* Speed of talking
* Laughter (n, rate)
* Applause (n)
* Questions (n)
* Stories (n)
* Exclamation (n)
* Year and season
* References to people
* Filler words - e.g., 'so', 'um'
* % of nouns, verbs, adjectives  

In [None]:
# Tokenise each talk's tag string and tally token frequency over all talks,
# leaving out the TED-branded tags. The pattern matches runs of word
# characters and apostrophes, so a tag containing a space becomes several tokens.
df['tags_list'] = df.tags.map(lambda raw_tags: re.findall(r"[\w']+", raw_tags))

master = [token for talk_tokens in df.tags_list for token in talk_tokens]

ignore = ['TED', 'TEDx']
tags = collections.Counter(filter(lambda token: token not in ignore, master))

In [None]:
# The 20 most frequent tag tokens (names only; the counts are dropped)
tags_common = [name for name, _count in tags.most_common(20)]
tags_common

In [None]:
# One 0/1 indicator column per common tag.
# Membership is tested against the token list itself. The previous
# `i in str(x)` was a substring test on the list's repr, so a short tag also
# matched when it merely appeared inside a longer token.
for i in tags_common:
    df[i] = df['tags_list'].apply(lambda tokens, tag=i: 1 if tag in tokens else 0)

In [None]:
# Alias the dated views column and add its natural logarithm
df['views'] = df['views_as_of_06162017']
df['log_views'] = df['views'].map(np.log)

In [None]:
# Specific components: raw occurrence counts taken from the transcript copy
# that still contains the (Laughter)/(Applause) markers.
df['laughter_n'] = df.transcript_clean.apply(lambda x: str(x).count('(Laughter)'))
df['applause_n'] = df.transcript_clean.apply(lambda x: str(x).count('(Applause)'))
df['questions_n'] = df.transcript_clean.apply(lambda x: str(x).count('?'))
# `'story' or 'stories'` evaluates to just 'story', so the plural was never
# counted. Count both forms explicitly; 'stories' does not contain 'story',
# so nothing is counted twice.
df['stories_n'] = df.transcript_clean.apply(
    lambda x: str(x).count('story') + str(x).count('stories'))
df['exclamation_n'] = df.transcript_clean.apply(lambda x: str(x).count('!'))

In [None]:
# Convert duration to float
def _duration_to_minutes(duration):
    """Convert a colon-separated duration string to minutes as a float.

    The three fields are weighted 60, 1 and 1/60 (presumably H:MM:SS — confirm
    against the data). Dividing by 60.0 keeps the last field's contribution
    fractional on Python 2 as well, where `int / 60` would truncate to 0.
    """
    parts = duration.split(':')
    return int(parts[0]) * 60 + int(parts[1]) + int(parts[2]) / 60.0


# Mapped directly, so no temporary column has to be created and deleted.
df['time'] = df.duration.map(_duration_to_minutes)

# Rate of talking and laughter (per unit of `time`)
df['talking_speed'] = df.words_n / df.time
df['laughter_speed'] = df.laughter_n / df.time

In [None]:
# Sentence count per talk (NLTK sentence tokenizer) and mean words per sentence
df['sentences_n'] = df.text_clean.map(sent_tokenize).map(len)
df['sentence_length'] = df['words_n'] / df['sentences_n']

In [None]:
# Year and season
# Parse each publication date once and reuse the result for both fields
# (the date strings were previously parsed twice).
_published = [parser.parse(raw_date) for raw_date in df.date_published]
df['year'] = [stamp.year for stamp in _published]
df['month'] = [stamp.month for stamp in _published]

# Month number -> season label. Mapping the month column directly replaces the
# earlier whole-frame `df.replace(..., inplace=True)` call.
season_by_month = {1: 'Winter', 2: 'Winter', 3: 'Spring',
                   4: 'Spring', 5: 'Spring', 6: 'Summer', 7: 'Summer',
                   8: 'Summer', 9: 'Fall', 10: 'Fall',
                   11: 'Fall', 12: 'Winter'}
df['season'] = df['month'].map(season_by_month)

# One indicator column per season, appended to the frame.
# NOTE: re-running this cell appends the indicator columns again.
seasons = pd.get_dummies(df['season'])
df = pd.concat([df, seasons], axis=1)

In [None]:
# References to people: occurrences of space-delimited pronouns per transcript.
def _token_counter(token, fold_case=True):
    """Return a function counting `token` in a text, lower-casing it first if asked."""
    def count(text):
        text = str(text)
        if fold_case:
            text = text.lower()
        return text.count(token)
    return count


df['he'] = df.transcript_clean.apply(_token_counter(' he '))
df['she'] = df.transcript_clean.apply(_token_counter(' she '))
df['he_she'] = df['he'] + df['she']
# ' I ' is matched case-sensitively, unlike the other pronouns
df['self'] = df.transcript_clean.apply(_token_counter(' I ', fold_case=False))
df['we'] = df.transcript_clean.apply(_token_counter(' we '))
df['you'] = df.transcript_clean.apply(_token_counter(' you '))

In [None]:
# Total occurrences of the space-delimited fillers "um", "uh" and "so",
# counted on the lower-cased transcript.
def _filler_total(text):
    """Sum the occurrences of each filler token in the lower-cased text."""
    lowered = str(text).lower()
    return sum(lowered.count(filler) for filler in (" um ", " uh ", " so "))


df['filler'] = df.transcript_clean.apply(_filler_total)

In [None]:
# Average filler count per talk
df.filler.mean()

In [None]:
# Parts of speech
from collections import Counter


def _pos_tag_counts(text):
    """Tokenise `text`, POS-tag it with NLTK and tally how often each tag occurs."""
    return Counter(tag for _word, tag in pos_tag(word_tokenize(text)))


# Build the whole column in one assignment. The previous per-row
# `df['counts'][i] = ...` was chained assignment, which pandas does not
# guarantee to write back to `df`.
# NOTE(review): this tags the raw `transcript` column rather than a cleaned
# copy, so timestamps and reaction markers are presumably tagged too — confirm
# that is intended.
df['counts'] = [_pos_tag_counts(text) for text in df.transcript]

In [None]:
# Tag groups summed into one count per talk.
# NOTE(review): RB/RBR/RBS are adverb tags in the Penn Treebank set, so the
# 'adjectives' group presumably also counts adverbs — confirm intended.
adjectives = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
nouns = ['NN', 'NNP', 'NNPS', 'NNS']
verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def _group_total(tag_group):
    """Return a function that sums a tag Counter over the tags in `tag_group`."""
    return lambda tally: sum(tally[tag] for tag in tag_group)


df['adjectives'] = df.counts.apply(_group_total(adjectives))
df['verbs'] = df.counts.apply(_group_total(verbs))
df['nouns'] = df.counts.apply(_group_total(nouns))

In [None]:
# Each part-of-speech count divided by the talk's word count (a 0-1 fraction,
# despite the '_percent' suffix)
for pos_column in ('adjectives', 'verbs', 'nouns'):
    df[pos_column + '_percent'] = df[pos_column] / df['words_n']

In [None]:
# Preview the frame with the engineered features
df.head()

# Functions to use for mapping sentiment change

In [None]:
# Generate a moving window to map sentiment
def window(seq, overlap):
    """Yield a slice of up to `overlap` items starting at every position of `seq`.

    Consecutive slices advance by one item, so the slices near the end are
    shorter than `overlap`.
    """
    starts = range(len(seq))
    for begin in starts:
        end = begin + overlap
        yield seq[begin:end]

def merge(seq, slide):
    """Yield consecutive, non-overlapping slices of `seq`, each up to `slide` items long."""
    starts = range(0, len(seq), slide)
    for begin in starts:
        end = begin + slide
        yield seq[begin:end]

In [None]:
# Graph sentiment over time using a sliding window of text
def storyarc(i, overlap, slide):
    """Score sentiment along talk `i` using a sliding window of text.

    The words of ``df.text[i]`` are grouped into consecutive chunks of `slide`
    words. Each sample joins up to `overlap` consecutive chunks, advancing one
    chunk at a time (the trailing samples therefore hold fewer chunks), and is
    scored with TextBlob polarity.

    Nothing is returned; the function works through side effects:
      * the global ``d`` is rebound to a dict with the 'samples' and 'scores'
        of the talk just processed;
      * the list of scores is appended to the module-level list ``test``;
      * row `i` of ``df`` receives the scores as an array in 'sentiment_array'
        and their count in 'sentiment_array_n'.

    ``test`` and both ``df`` columns must already exist when this is called.
    """
    global d
    d = {}

    words = df.text[i].split()
    merged_words = [' '.join(chunk) for chunk in merge(words, slide)]
    samples = [' '.join(group) for group in window(merged_words, overlap)]
    d['samples'] = samples

    # Score sentiment
    sentiments = [TextBlob(sample).sentiment.polarity for sample in samples]
    d['scores'] = sentiments

    test.append(sentiments)
    # `.at` writes the cell directly into the frame. The previous
    # `df['sentiment_array'][i] = ...` was chained assignment, which pandas
    # does not guarantee to propagate back to `df`.
    df.at[i, 'sentiment_array'] = np.asarray(sentiments)
    df.at[i, 'sentiment_array_n'] = len(sentiments)

In [None]:
# Graph sentiment shape for a particular speaker
def sentiment_graph(i, degrees):
    """Plot the resampled sentiment series of talk `i` with a polynomial trend.

    Reads ``df.sentiment_array_interp[i]``, fits a polynomial of degree
    `degrees` to it, and draws the raw series (blue) and the fitted curve
    (red) on the current pyplot axes with the y-axis fixed to [-1, 1].
    """
    series = df.sentiment_array_interp[i]
    positions = range(len(series))
    coefficients = np.polyfit(positions, series, degrees)
    trend = np.polyval(coefficients, positions)

    plt.gca().set_ylim([-1, 1])

    label_font = dict(fontsize=14, fontname="Helvetica")
    plt.suptitle('Story Arc', fontsize=16, fontname="Helvetica")
    plt.xlabel('Text window', **label_font)
    plt.ylabel('Sentiment', **label_font)
    plt.plot(positions, series, 'b-')
    plt.plot(positions, trend, 'r-')

# Sentiment analysis - overall and over time

In [None]:
# Sentiment - overall
# Score each talk once and read both fields from the same result; the text
# was previously analysed twice, once per column.
_overall_sentiment = [TextBlob(text).sentiment for text in df.text_clean]
df['polarity'] = [score.polarity for score in _overall_sentiment]
df['subjectivity'] = [score.subjectivity for score in _overall_sentiment]

**Array of sentiment**

In [None]:
# Holders that storyarc() fills in: a running list of per-talk score lists and
# two columns (the score array and the number of scores), initialised to None.
test = list()

for empty_column in ('sentiment_array', 'sentiment_array_n'):
    df[empty_column] = None

In [None]:
# Score every talk: 25-word chunks, each sample spanning up to 2 chunks
for row in range(len(df)):
    storyarc(row, overlap=2, slide=25)

In [None]:
# Per-talk summary statistics of the windowed sentiment scores
df['min_sentiment'] = df['sentiment_array'].map(min)
df['max_sentiment'] = df['sentiment_array'].map(max)
df['sentiment_range'] = df['max_sentiment'] - df['min_sentiment']
df['sentiment_std'] = df['sentiment_array'].map(np.std)

In [None]:
# Preview the frame with the sentiment columns
df.head()

# Clustering of sentiment shape

Use the k-Shape algorithm to identify shape patterns in the sentiment time series generated above. The algorithm will be used to identify "story arc" clusters.

More information on k-Shape is available here: http://www.cs.columbia.edu/~jopa/kshape.html.

In [None]:
# NOTE(review): machine-specific absolute path, this time to a Python 3.6
# site-packages directory (an earlier cell appends a Python 2.7 one).
# Presumably needed so `kshape` resolves on the author's machine — confirm, and
# prefer installing it into the kernel's own environment.
sys.path.append("/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages")
from kshape.core import kshape, zscore

In [None]:
# Median number of sentiment scores per talk
window_counts = df['sentiment_array_n']
med_array = window_counts.median()
print(med_array)

In [None]:
# Compress and stretch arrays to equal same size
import scipy.interpolate as interp

# Common length every sentiment series is resampled to.
# NOTE(review): presumably the median window count printed in the previous
# cell — confirm, and consider deriving it from `med_array`.
INTERP_LEN = 83


def _resample_or_error(values):
    """Interpolate `values` onto INTERP_LEN evenly spaced points.

    Returns the resampled array, or the string 'ERROR' when the series cannot
    be interpolated; the filter in the next cell removes those rows.
    """
    try:
        fit = interp.interp1d(np.arange(values.size), values)
        return fit(np.linspace(0, values.size - 1, INTERP_LEN))
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt through.
        return 'ERROR'


# The previous if/elif on the median ran identical code in both branches, so a
# single path is used for shorter and longer series alike. The column is built
# in one assignment instead of per-row chained assignment, which pandas does
# not guarantee to write back to `df`.
df['sentiment_array_interp'] = pd.Series(
    [_resample_or_error(values) for values in df['sentiment_array']],
    index=df.index, dtype=object)

In [None]:
# Keep only the rows whose series was resampled successfully. The cell type is
# tested instead of `!= 'ERROR'`: comparing a column of NumPy arrays with a
# string relies on a legacy NumPy fallback and is ambiguous on newer releases.
resampled_ok = df['sentiment_array_interp'].map(
    lambda cell: isinstance(cell, np.ndarray)).astype(bool)
df = df[resampled_ok].reset_index(drop=True)

In [None]:
# Preview the frame after dropping rows that failed to resample
df.head()

In [None]:
# Gather the resampled series, in row order, into a plain list
arrays = list(df['sentiment_array_interp'])

In [None]:
# Cluster the z-normalised series into `cluster_num` groups and print the size
# of each group.
# NOTE(review): entry [1] of each cluster is presumably its list of member row
# positions, and no random seed is set here, so the grouping may differ between
# runs — confirm against the kshape package.
cluster_num = 6
normalized = zscore(arrays, axis=1)
clusters = kshape(normalized, cluster_num)
for cluster_idx in range(cluster_num):
    print(len(clusters[cluster_idx][1]))

In [None]:
# Spot-check entries 120-129 of the first cluster's second field
clusters[0][1][120:130]

In [None]:
# Plot the first talk's sentiment series with a degree-3 polynomial fit
sentiment_graph(0, 3)

In [None]:
# Second field of every cluster, in cluster order
cluster_groups = [cluster[1] for cluster in clusters]

In [None]:
# Copy the row labels into a column, then add one 0/1 indicator column per
# cluster marking the rows listed in that cluster's group.
df['index'] = df.index

for cluster_idx, members in enumerate(cluster_groups):
    df['cluster_num' + str(cluster_idx)] = df['index'].apply(
        lambda row_id, members=members: 1 if row_id in members else 0)

In [None]:
# Preview the frame with the cluster indicator columns
df.head()

In [None]:
# EXPORT DATA
# Pickle the finished frame. The context manager closes the file handle once
# the dump completes; the previous bare open() never closed it.
file = 'df_text_122417'
with open(file, 'wb') as fileobj:
    pickle.dump(df, fileobj)