In [12]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
# Whatever other imports you need
import csv
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [13]:
def data_load(inputdir):
    folders = glob("{}/*".format(inputdir))
    corpus = []
    authors= []
    for author_folder in folders:
        author_specific_emails = glob("{}/*".format(author_folder))
        author = author_folder[13:] #zous lösning
        for email_path in author_specific_emails:
            authors.append(author)
            email_content = ""
            with open(email_path, "r") as email:
                for line in email:
                    email_content += line
            corpus += [email_content.lower()] #TODO: eventuellt ta bort lower() om det inte funkar
    return corpus, authors


In [3]:
#data_load("enron_sample")

In [14]:
corpus, authors = data_load("enron_sample")

In [30]:
def vectorize(corpus):
    stop = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer(stop_words=stop, token_pattern=r'(?u)\b[A-Za-z][A-Za-z]+\b')
    vectorized_corpus = vectorizer.fit_transform(corpus)
    print(type(vectorized_corpus))
    return vectorized_corpus

In [31]:
vectorized_corpus = vectorize(corpus)


<class 'scipy.sparse.csr.csr_matrix'>


In [17]:
def reduce_dims(vectorized_corpus, dims):
    # reduce dimensions
    svd = TruncatedSVD(n_components=dims)
    reduced_vectorized_corpus = svd.fit_transform(vectorized_corpus)
    return reduced_vectorized_corpus

In [29]:
reduced_vectorized_corpus = reduce_dims(vectorized_corpus, 300)
print(type(reduced_vectorized_corpus))
print(reduced_vectorized_corpus[0])

<class 'numpy.ndarray'>
[ 0.0895656   0.05990686 -0.03340817  0.00726535  0.00136965  0.0330733
  0.01432221  0.01927053  0.03659325 -0.02949387 -0.02300034  0.02336136
  0.05732967  0.05476647 -0.00257795 -0.03518263  0.01635245 -0.07196698
  0.03977565  0.01409198 -0.01931466  0.00095158 -0.00670579  0.03146775
  0.0282375   0.07592153  0.07679435  0.05502963 -0.0327906   0.01696947
 -0.0262068   0.00726342 -0.00777126  0.00602958  0.00177497  0.0140207
  0.01178115 -0.00536675  0.02174938 -0.02689821  0.00859237  0.02372513
 -0.01350589 -0.06950285  0.038991    0.17398888 -0.03968237 -0.10780808
  0.14103779  0.06836757  0.08705208 -0.07662448 -0.07036098  0.02971381
  0.04666489 -0.1167457   0.00637345 -0.08368304 -0.02552255 -0.05888636
 -0.11847793  0.08015551  0.10837148 -0.09768247  0.0955591  -0.09882364
  0.02438561 -0.00300661 -0.0009404  -0.06906024  0.01187666 -0.00775418
 -0.05715216 -0.00201807 -0.04537133 -0.06983308  0.04458669  0.02175966
  0.05322888  0.05501134 -0.0

In [27]:
def zip_vectors_authors(reduced_vectorized_corpus, authors):
    vectors_labels_zipped = zip(authors, reduced_vectorized_corpus)
    vectors_labels_list = list(vectors_labels_zipped)
    return vectors_labels_list

In [28]:
vectors_labels_list = zip_vectors_authors(authors, reduced_vectorized_corpus)
print(vectors_labels_list[0])

(array([ 0.0895656 ,  0.05990686, -0.03340817,  0.00726535,  0.00136965,
        0.0330733 ,  0.01432221,  0.01927053,  0.03659324, -0.02949386,
       -0.02300032,  0.0233614 ,  0.05732974,  0.05476653, -0.00257795,
       -0.03518258,  0.01635255, -0.07196713,  0.03977528,  0.01409226,
       -0.01931489,  0.00095201, -0.00670577,  0.03146845,  0.02823617,
        0.07591977,  0.07679578,  0.0550313 , -0.03279192,  0.01696902,
       -0.02620707,  0.00726211, -0.00777651,  0.00602426,  0.00177615,
        0.01402237,  0.01178236, -0.00536642,  0.02175425, -0.02689528,
        0.00858108,  0.02374079, -0.01350926, -0.0695013 ,  0.03897882,
        0.17399478, -0.03968178, -0.10780515,  0.1409989 ,  0.06836062,
        0.08706408, -0.07657138, -0.07038643,  0.02975028,  0.04661621,
       -0.11672098,  0.00640366, -0.08362389, -0.02553976, -0.05884938,
       -0.1185049 ,  0.08013689,  0.1083695 , -0.09760725,  0.09545307,
       -0.09896121,  0.02440192, -0.00306739, -0.00111026, -0.0

In [19]:
def create_df(reduced_vectorized_corpus, authors):
    df = pd.DataFrame(data=reduced_vectorized_corpus)
    df.insert(0, "Author", authors)
    return df

In [20]:
df = create_df(reduced_vectorized_corpus, authors)

In [11]:
def shuffle_split(X,y):
    X_train, X_test, y_train, y_test = train_test_split(reduced_vectorized_corpus, authors, test_size=0.20, random_state=42)
    print(X_train[0]) #X_test[0], y_train[0], y_test[0])

In [21]:
shuffle_split(reduced_vectorized_corpus, authors)

[ 2.40782108e-01  3.23819909e-01  3.12527554e-01 -7.11847067e-02
  5.59363056e-03 -2.41203269e-02 -9.13754747e-03 -1.01567549e-02
 -4.75859873e-02 -2.24069106e-02  2.61804955e-02 -4.35894157e-02
 -8.29971549e-03  2.71671008e-02 -3.86220499e-03  3.32789303e-02
 -3.11964344e-02 -5.31087201e-02  5.76779858e-02  1.24095815e-01
 -8.57628558e-02 -6.17018481e-02 -9.43679498e-02 -5.49997954e-02
  7.45927122e-02 -6.61253187e-02 -5.78021971e-02 -5.17772753e-02
  7.11206243e-02 -1.34750853e-02 -3.92311868e-02 -2.01451745e-01
 -9.71885315e-02  5.63066148e-03  3.20295034e-01  2.61454996e-01
 -4.99257061e-02  9.49157976e-03  3.74899103e-01  2.09752994e-01
  6.79580308e-03  1.22096532e-02  2.61867951e-01 -7.69171910e-02
 -5.50025437e-02 -6.92428129e-02 -1.12968182e-01  5.94588426e-02
  6.04646594e-02 -2.96409235e-02 -1.09187289e-01  4.54782026e-02
  4.11188610e-02 -3.92962953e-02 -1.03825073e-02 -7.22709271e-02
  4.02160784e-02 -1.81710462e-02  2.26226399e-03  1.76655153e-04
 -3.04153743e-02  3.15424