In [1]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [2]:
import glob, os
import pandas as pd

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None
#pd.options.display.max_colwidth = None

In [4]:
file_contents = []

for file in glob.glob("computer_transcripts/*.txt"):

    #print(file.split('/')[-1])
    
    with open(file,"r") as f:
        text_content = f.read()
        
    file_contents.append((file, text_content))

In [5]:
df_transcripts = pd.DataFrame(file_contents)
df_transcripts.columns = ['file_name', 'text_content']

In [6]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [7]:
# data cleaning (stop words, remove non-alpha text, lemmatize)
df_transcripts['no_stop'] = df_transcripts['text_content'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word.lower() not in stop))
df_transcripts['alpha_text'] = df_transcripts.no_stop.str.replace("[^a-zA-Z]", ' ')
df_transcripts['alpha_text'] = df_transcripts['alpha_text'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')
df_transcripts['no_stop'] = df_transcripts['alpha_text'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word.lower() not in stop))
df_transcripts['lemmatized_text'] = df_transcripts['no_stop'].apply(
    lambda words: ' '.join(lemmatizer.lemmatize(w) for w in words.split()))

  df_transcripts['alpha_text'] = df_transcripts.no_stop.str.replace("[^a-zA-Z]", ' ')
  df_transcripts['alpha_text'] = df_transcripts['alpha_text'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')


In [8]:
#df_transcripts

In [9]:
tfv = TfidfVectorizer(ngram_range = (1,1))

In [10]:
vec_text = tfv.fit_transform(df_transcripts['lemmatized_text'])

In [11]:
df_transcripts.columns

Index(['file_name', 'text_content', 'no_stop', 'alpha_text',
       'lemmatized_text'],
      dtype='object')

In [12]:
words = tfv.get_feature_names()

In [13]:
# bag of words
#words[:10]

In [14]:
# setting up kmeans with 2 clusters, since we selected either advertising or legal
kmeans = KMeans(n_clusters = 2)
kmeans.fit(vec_text)
cluster_words = kmeans.cluster_centers_
df_cluster_words = pd.DataFrame(cluster_words, columns=words).T

In [15]:
# most common words by cluster
for i in range(0, 2):
    print('Cluster', i)
    print(df_cluster_words.sort_values(i, ascending=False)[i].head(20))
    print('\n')

Cluster 0
filter         0.188225
coupon         0.153383
taste          0.116241
raleigh        0.097545
viceroy        0.090958
gift           0.087537
cigarette      0.074398
flavor         0.069083
gold           0.068982
independent    0.067696
extra          0.057755
cool           0.055487
smoke          0.053480
better         0.051250
fresh          0.051247
time           0.049919
right          0.049723
king           0.045788
never          0.045233
bel            0.044509
Name: 0, dtype: float64


Cluster 1
tobacco      0.132123
would        0.126887
think        0.119366
question     0.104544
morris       0.082012
philip       0.081113
cigarette    0.079573
mr           0.078083
company      0.076393
case         0.076276
nicotine     0.071952
product      0.067135
one          0.065102
going        0.063498
know         0.061543
people       0.057811
well         0.057203
industry     0.056691
year         0.055635
time         0.055605
Name: 1, dtype: float64




In [16]:
# predict a category for each document in the training set.
# would be interesting to see how closely these match our categories of legal or advertising
df_transcripts['pred'] = kmeans.predict(tfv.transform(df_transcripts['lemmatized_text']))

In [17]:
df_transcripts[['file_name', 'lemmatized_text', 'pred']]

Unnamed: 0,file_name,lemmatized_text,pred
0,computer_transcripts/tobacco_qjb77c00.txt,good afternoon everybody thanks coming short n...,1
1,computer_transcripts/tobacco_kpr91e00.txt,living people think take independent action br...,0
2,computer_transcripts/tobacco_qyq95i00.txt,going back record start take number two deposi...,1
3,computer_transcripts/tobacco_xpu03f00.txt,hardly folk never smoked raleigh cigarette wou...,0
4,computer_transcripts/tobacco_hno23e00.txt,good morning thank coming press conference nam...,1
5,computer_transcripts/tobacco_qdo23e00.txt,today cigarette cannot answer today smoking sa...,0
6,computer_transcripts/tobacco_gav28d00.txt,hello welcome mark firestone vice president as...,1
7,computer_transcripts/tobacco_qar62a00.txt,name alberta think okay catch pas world end li...,0
8,computer_transcripts/tobacco_mnjp0149.txt,jonathan representing plane new york state act...,1
9,computer_transcripts/tobacco_lpp06a00.txt,quota remained correct follow line questioning...,1


In [18]:
from pandasql import sqldf 
pysqldf = lambda q: sqldf(q, globals())

In [19]:
# verifying records are evenly split between categories
pysqldf("""
SELECT 
    pred, COUNT(pred)
FROM
    df_transcripts
GROUP BY pred
""")

Unnamed: 0,pred,COUNT(pred)
0,0,17
1,1,18
