In [2]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from collections import Counter
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle



basepath = Path("enron_sample/")


# getting the documents

def get_files(samples):
    dirs_in_basepath = (entry for entry in samples.iterdir() if entry.is_dir())
    authors = []
    sample_list=[]
    df = pd.DataFrame()
    my_dict={}
    for directory in dirs_in_basepath:
        subdirs =  (e for e in directory.iterdir() if directory.is_dir())
        stop_words = set(stopwords.words('english'))
        for filename in subdirs:
            authors.append(directory)
            with open(filename, "r") as thefile:
                thefile = list(thefile)
                thefile = ' '.join(thefile)
                thefile = word_tokenize(thefile)
                thefile = [''.join(c.lower() for c in s if c!="") for s in thefile if s not in string.punctuation if s.isalpha() is True]
                f_sample = [w for w in thefile if not w in stop_words]
                thefile = ' '.join(f_sample)
                sample_list.append(thefile)
                
    return sample_list, authors


# print("Constructing table with {} feature dimensions and {}% test instances...".format(args.dims, args.testsize))    

def extract_features(samples):
    sample_list = get_files(samples)[0]
    tfidf = TfidfVectorizer()
    feature_matrix = tfidf.fit_transform(sample_list)

    return feature_matrix.toarray()
        

X1 = extract_features(basepath)
# print(X1.shape)   

def reduce_dim(X, n=2):
    svd = TruncatedSVD(n)
    transformed = svd.fit_transform(X)
    return transformed
   
X2 = reduce_dim(X1, 1914)

def shuffle_split(X):
    df = pd.DataFrame(X)
    le = preprocessing.LabelEncoder()
    df.insert(0,'Authors', pd.Series(get_files(basepath)[1]).values)
    df['Authors'] = le.fit_transform(df['Authors'])
    
    data_labels = []
    cols = [col for col in df.columns if col not in ["Authors"]]
    data = df[cols]
    target = df['Authors']
    data, target = shuffle(data, target)

    data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.20, random_state=40002)
    
    
print(shuffle_split(X2))


    

      Authors         0         1         2         3         4         5  \
0           5  0.063486  0.080597  0.026619  0.041771  0.022132  0.028081   
1           5  0.072900  0.109963  0.031539  0.054478  0.017827  0.004438   
2           5  0.071860  0.042823  0.026909  0.021559  0.025429  0.016004   
3           5  0.051056  0.042071  0.022045  0.018743  0.027726  0.028226   
4           5  0.022799  0.015421  0.006993  0.006365  0.003594  0.018395   
...       ...       ...       ...       ...       ...       ...       ...   
2924        0  0.053684  0.070428  0.022339  0.035751  0.010370  0.020385   
2925        0  0.088106  0.051381  0.012965  0.022343  0.020954  0.024338   
2926        0  0.044560  0.054701  0.015818  0.018855  0.034728  0.020092   
2927        0  0.059025  0.066132  0.016182  0.024144  0.019784  0.016623   
2928        0  0.059382  0.046986  0.021204  0.023848  0.015783  0.018223   

             6         7         8  ...      1904      1905      1906  \
0 