In [63]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

## Import Data

Lists need to be converted from strings

In [None]:
def _split_list_cell(cell):
    """Turn a CSV cell such as "[a, b]" into the list of strings ["a", "b"]."""
    return cell.strip("[]").split(", ")


# Both list-valued columns are stored as bracketed strings and share one converter.
list_columns = ("base_word_lengths", "neutral_relative_word_lengths")
data = pd.read_csv(
    "clean_data.csv",
    converters={column: _split_list_cell for column in list_columns},
)
data.head()

Clean up empty cells

In [45]:
# Drop every row whose `base_word_lengths` list contains an empty-string entry.
# The affected rows are identified first and removed in a single call: dropping
# inside the row loop rebuilt the frame once per blank entry, and a row holding
# more than one blank was dropped twice (the second drop raises KeyError).
has_blank_length = (
    data["base_word_lengths"].apply(lambda lengths: '' in lengths).astype(bool)
)
data = data.drop(index=has_blank_length[has_blank_length].index)

Convert to numbers

In [46]:
# Convert every entry of each `base_word_lengths` list to float.
# The column is assigned as a whole instead of through chained indexing
# (`data[col][index] = ...`), which pandas warns about and which does not
# write back to `data` when copy-on-write is enabled.
data["base_word_lengths"] = data["base_word_lengths"].apply(
    lambda lengths: [float(length) for length in lengths]
)

Create a per phrase rate of speech

In [47]:
# Create the `ros` column with a float placeholder (NaN) rather than "".
# An empty-string placeholder makes the column object dtype, so the numeric
# cells further down only work after a separate to_numeric conversion.
data['ros'] = float("nan")

In [48]:
# `ros` = mean of each row's `base_word_lengths` list.
# Computed for the whole column in one assignment: the result is a float
# column, and no chained indexing (`data["ros"][index] = ...`) is involved.
data["ros"] = (
    data["base_word_lengths"]
    .apply(lambda lengths: sum(lengths) / len(lengths))
    .astype(float)
)

In [23]:
# Coerce the `ros` column to a numeric dtype.
numeric_ros = pd.to_numeric(data["ros"])
data["ros"] = numeric_ros

Speaker codes

In [49]:
# Add a `speaker` column of empty strings; the following cell fills it from `filename`.
data['speaker'] = ""

In [50]:
# Speaker code = the part of `filename` before the first underscore.
# Vectorised with the .str accessor instead of a row loop with chained
# indexing (`data["speaker"][index] = ...`), which pandas warns about and
# which does not write back to `data` under copy-on-write.
data["speaker"] = data["filename"].str.split("_").str[0]

Intensity codes

In [51]:
# Add an `intensity` column of empty strings; the following cell fills it from `filename`.
data['intensity'] = ""

In [52]:
# Intensity code = the part of `filename` after the last underscore.
# Vectorised with the .str accessor instead of a row loop with chained
# indexing (`data["intensity"][index] = ...`), which pandas warns about and
# which does not write back to `data` under copy-on-write.
data["intensity"] = data["filename"].str.split("_").str[-1]

## Intensity analysis

In [None]:
# Box plot of `ros` per `intensity` code on an explicit, titled axes so the
# figure can be read on its own; plt.show() keeps the Axes repr out of the output.
fig, ax = plt.subplots()
sns.boxplot(data=data, x='intensity', y='ros', ax=ax)
ax.set_title("ros by intensity")
plt.show()

In [None]:
# Tukey HSD pairwise comparison of `ros` between `intensity` groups (alpha = 0.01).
m_comp = pairwise_tukeyhsd(
    endog=data["ros"],
    groups=data["intensity"],
    alpha=0.01,
)
print(m_comp)

All intensity levels are significantly different from one another (Tukey HSD, alpha = 0.01).

## Script analysis

In [22]:
# Script sentences, in the order used to index them (scripts[0] ... scripts[11]).
# Each one is compared against the `script` column to slice out per-script frames.
scripts = [
    "It's eleven o'clock",
    "That is exactly what happened",
    "I'm on my way to the meeting",
    "I wonder what this is about",
    "The airplane is almost full",
    "Maybe tomorrow it will be cold",
    "I would like a new alarm clock",
    "I think I have a doctor's appointment",
    "Don't forget a jacket",
    "I think I've seen this before",
    "The surface is slick",
    "We'll stop in a couple of minutes",
]

In [25]:
# One frame per script: scriptN holds the rows of `data` whose `script`
# column equals scripts[N], for N = 0 ... 11.
(
    script0, script1, script2, script3, script4, script5,
    script6, script7, script8, script9, script10, script11,
) = (data[data["script"] == script_text] for script_text in scripts[:12])

In [None]:
# Box plot of `ros` per script on an explicit, titled axes.
# The x categories are the script texts (presumably the full sentences from
# `scripts` - confirm against the CSV), so the figure is widened and the tick
# labels are rotated to keep them from overlapping.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, x='script', y='ros', ax=ax)
ax.set_title("ros by script")
ax.tick_params(axis='x', labelrotation=90)
plt.show()

## Speaker analysis

All scripts

In [None]:
# Box plot of `ros` per speaker code on an explicit, titled axes so the
# figure can be read on its own; plt.show() keeps the Axes repr out of the output.
fig, ax = plt.subplots()
sns.boxplot(data=data, x='speaker', y='ros', ax=ax)
ax.set_title("ros by speaker (all scripts)")
plt.show()

In [None]:
# Fit `ros` against the categorical `speaker` factor and display the type-2 ANOVA table.
speaker_formula = "ros ~ C(speaker)"
model = ols(speaker_formula, data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

In [None]:
# Tukey HSD pairwise comparison of `ros` between `speaker` groups (alpha = 0.01).
m_comp = pairwise_tukeyhsd(
    endog=data["ros"],
    groups=data["speaker"],
    alpha=0.01,
)
print(m_comp)

Look at one script

In [None]:
# Box plot of `ros` per speaker for the script3 rows only, titled with the
# script text (scripts[3]) so the figure says which script it shows.
fig, ax = plt.subplots()
sns.boxplot(data=script3, x='speaker', y='ros', ax=ax)
ax.set_title(f"ros by speaker: {scripts[3]}")
plt.show()

## Emotion analysis

All scripts

In [None]:
# Box plot of `ros` per emotion code on an explicit, titled axes so the
# figure can be read on its own; plt.show() keeps the Axes repr out of the output.
fig, ax = plt.subplots()
sns.boxplot(data=data, x='emotion', y='ros', ax=ax)
ax.set_title("ros by emotion (all scripts)")
plt.show()

In [None]:
# Mean `ros` within each `emotion` group.
ros_by_emotion = data.groupby("emotion")["ros"]
av_ros = ros_by_emotion.mean()
print(av_ros)

In [None]:
# Fit `ros` against the categorical `emotion` factor and display the type-2 ANOVA table.
emotion_formula = "ros ~ C(emotion)"
model = ols(emotion_formula, data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

In [None]:
# Tukey HSD pairwise comparison of `ros` between `emotion` groups (alpha = 0.01).
m_comp = pairwise_tukeyhsd(
    endog=data["ros"],
    groups=data["emotion"],
    alpha=0.01,
)
print(m_comp)

Per script analysis

In [None]:
# One emotion-vs-ros box plot per script, each titled with its script text.
# A single loop replaces twelve copy-pasted cells that differed only in the
# frame passed in and gave no indication of which script each plot showed.
# Plots are drawn in `scripts` order (index 0 first).
for script_text in scripts:
    # Same filter that defines script0 ... script11.
    script_rows = data[data["script"] == script_text]
    fig, ax = plt.subplots()
    sns.boxplot(data=script_rows, x='emotion', y='ros', ax=ax)
    ax.set_title(f"ros by emotion: {script_text}")
    plt.show()

## Attempt clustering

Look for sub emotion clusters

In [76]:
# Rows with emotion code "H". Taken as an explicit copy because later cells
# add columns to and drop rows from `Happy`; modifying a bare slice of `data`
# triggers SettingWithCopyWarning.
Happy = data[data['emotion'] == "H"].copy()

Per-phrase ROS computed from the word lengths relative to neutral (`neutral_relative_word_lengths`)

In [None]:
# Create the `relative_ros` column with a float placeholder (NaN) rather than "".
# An empty-string placeholder makes the column object dtype, and nothing later
# in the notebook converts `relative_ros` back to a numeric dtype.
Happy['relative_ros'] = float("nan")

In [None]:
# Drop every row of `Happy` whose `neutral_relative_word_lengths` list contains
# an empty-string entry. The affected rows are identified first and removed in
# a single call: dropping inside the row loop rebuilt the frame once per blank
# entry, and a row holding more than one blank was dropped twice (the second
# drop raises KeyError).
has_blank_relative = (
    Happy["neutral_relative_word_lengths"]
    .apply(lambda lengths: '' in lengths)
    .astype(bool)
)
Happy = Happy.drop(index=has_blank_relative[has_blank_relative].index)

In [None]:
# Convert every entry of each `neutral_relative_word_lengths` list to float.
# The column is assigned as a whole instead of through chained indexing
# (`Happy[col][index] = ...`), which pandas warns about and which does not
# write back to `Happy` when copy-on-write is enabled.
Happy["neutral_relative_word_lengths"] = Happy["neutral_relative_word_lengths"].apply(
    lambda lengths: [float(length) for length in lengths]
)

In [None]:
# `relative_ros` = mean of each row's `neutral_relative_word_lengths` list.
# Computed for the whole column in one assignment: the result is a float
# column, and no chained indexing (`Happy["relative_ros"][index] = ...`) is involved.
Happy["relative_ros"] = (
    Happy["neutral_relative_word_lengths"]
    .apply(lambda lengths: sum(lengths) / len(lengths))
    .astype(float)
)

In [82]:
# Feature matrix for clustering: the two rate columns of `Happy`.
# astype(float) guarantees numeric dtypes (either column may still be object
# dtype if it was filled element by element) and returns a new frame, so the
# `Labels` column added to X later is not written into a slice of `Happy`.
X = Happy[["ros", "relative_ros"]].astype(float)

Look for optimal number of clusters

In [None]:
# Elbow search: fit KMeans for k = 1 ... 19 and plot the inertia against k.

# Cluster on the two feature columns only, so that re-running this cell after
# the `Labels` column has been added to X does not cluster on the labels.
features = X[["ros", "relative_ros"]]

# KMeans cannot fit more clusters than there are samples, so cap k accordingly.
k_values = list(range(1, min(19, len(features)) + 1))

clusters = []  # inertia of the fitted model for each k in k_values
for k in k_values:
    # random_state makes the curve reproducible between runs; n_init is given
    # explicitly because its default differs between scikit-learn versions.
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)
    clusters.append(km.inertia_)

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=k_values, y=clusters, ax=ax)
ax.set_title('Searching for Elbow')
ax.set_xlabel('Clusters')
ax.set_ylabel('Inertia')

plt.show()

In [None]:
# Fit a 2-cluster KMeans on the two feature columns and attach the labels to X.
# - The feature columns are selected explicitly so that re-running this cell
#   does not include a previously added `Labels` column in the fit.
# - random_state makes the assignment reproducible; n_init is given explicitly
#   because its default differs between scikit-learn versions.
# - assign() builds a new frame instead of writing a column into a slice.
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X[["ros", "relative_ros"]])
X = X.assign(Labels=km.labels_)

In [None]:
# Scatter of the two features coloured by KMeans label. plt.scatter does not
# label the axes on its own, so both axis labels and a title are set explicitly.
fig, ax = plt.subplots()
ax.scatter(X['ros'], X['relative_ros'], c=X['Labels'])
ax.set_xlabel('ros')
ax.set_ylabel('relative_ros')
ax.set_title('KMeans labels (k=2) for emotion "H"')
plt.show()

No clear clusters; the same holds for anger.