In [1]:
# This bit of code uses the os.walk function from Python's os module to build a list
# of all the .txt files in the 'txt' folder.
# os.walk visits the folder and each of its subfolders in turn; for every directory it
# yields the directory path, a list of its subfolders, and a list of the files in it.
# I loop through each list of files and use the endswith method to skip anything that
# is not a text file.
# I then append each text file's path (directory + file name) to the list called all_txt_files.
# Finally, I store the length of the list in n_files and display the entry at index 365,
# which only exists if at least 366 file names were found.
# This loop-and-append approach is very common in Python. You might even call it Pythonic.

import os

all_txt_files = []
for root, dirs, files in os.walk("txt"):
    for file in files:
        # guard clause: ignore everything that does not end in ".txt"
        if not file.endswith(".txt"):
            continue
        all_txt_files.append(os.path.join(root, file))

n_files = len(all_txt_files)
all_txt_files[365]

'txt/0426.txt'

In [2]:
# sorted() puts the list of files in ascending order by file name
# I then display all_txt_files[0] to verify that txt/0101.txt is the first item in the list
all_txt_files = sorted(all_txt_files)
all_txt_files[0]

'txt/0101.txt'

In [3]:
# In this bit of code, I do another loop-and-append:
# this time, I loop over my list of file names and open each file.
# The read method returns the full contents of the file as one string,
# and each string is appended, one by one, to a new list called all_docs.
# Crucially, the strings in all_docs are in the same order as the file names
# in all_txt_files, so all_docs[k] is the text of all_txt_files[k].
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used -- confirm that it matches the encoding of the corpus files.

all_docs = []
for txt_path in all_txt_files:
    with open(txt_path) as handle:
        all_docs.append(handle.read())

In [4]:
# import the TfidfVectorizer and CountVectorizer classes from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# CountVectorizer is instantiated with its default settings as 'vectorizer_counts';
# its fit_transform() method is run on the list of strings (all_docs) and the
# output is stored in X_counts
vectorizer_counts = CountVectorizer()
X_counts = vectorizer_counts.fit_transform(all_docs)

# TfidfVectorizer is a class, so I instantiate it with specific parameters as 'vectorizer'.
# I then run the object's fit_transform() method on my list of strings (all_docs).
# The stored variable X is the output of the fit_transform() method.
vectorizer = TfidfVectorizer(
    max_df=0.65,
    min_df=1,
    stop_words=None,
    use_idf=True,
    norm=None,
)
X = vectorizer.fit_transform(all_docs)

In [9]:
# The fit_transform() method converts the list of strings to a sparse matrix of TF-IDF values.
# The toarray method converts that matrix to a numpy array, which makes it easier to
# inspect every value, including the zeros.
myarray = X.toarray()

# the same conversion, applied to the matrix produced by the CountVectorizer
a_counts = X_counts.toarray()

In [10]:
# the first dimension of the numpy array is its number of rows; displaying it verifies
# that the array represents the same number of documents that we have in the file list
myarray.shape[0]

366

In [13]:
# Pair every term in the vectorizer's vocabulary with its TF-IDF score in the first
# document (row 0 of myarray).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(), so use the new method when it exists and fall back to the old
# one on earlier releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
doc_0_feature_scores = list(zip(feature_names, myarray[0]))

In [14]:
from sklearn.cluster import KMeans

In [18]:
# Fit a 4-cluster k-means model on the TF-IDF array (one row per document).
# random_state=0 seeds the centroid initialisation so repeated runs give the same result.
# n_init is pinned to 10, the value that was the default before scikit-learn 1.4 changed
# it to 'auto'; leaving it implicit would make the clustering depend on the installed version.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10).fit(myarray)

In [19]:
# display the cluster index that the fitted model assigned to each row of myarray
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,