In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.feature_extraction import stop_words
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def get_corpus(parent_folder):
    """Return the texts found under ``parent_folder`` as a list of strings.

    Every ``*.txt`` file contributes one string made of its lines, skipping
    any line whose first character is a digit. Sub-directories are explored
    recursively and their texts are added to the same list.

    Entries are visited in sorted name order so the result is reproducible
    (``os.listdir`` itself returns names in arbitrary order).

    Parameters
    ----------
    parent_folder : str
        Path of the directory to scan.

    Returns
    -------
    list of str
        One string per ``.txt`` file found.

    Raises
    ------
    OSError
        If ``parent_folder`` cannot be listed or a file cannot be read.
    """
    corpus = []
    for file_name in sorted(os.listdir(parent_folder)):
        path = parent_folder + "/" + file_name
        if file_name.endswith(".txt"):
            # `with` guarantees the handle is closed even if reading fails.
            with open(path, "r") as fichier:
                texte = "".join(
                    ligne for ligne in fichier if ligne[0] not in "0123456789"
                )
            corpus.append(texte)
        elif os.path.isdir(path):
            # Recurse into the sub-directory and keep what it yields.
            corpus.extend(get_corpus(path))

    return corpus

In [3]:
def somme_lignes(tab):
    """Sum the rows of a table, i.e. return one total per column.

    Parameters
    ----------
    tab : array-like
        Rectangular 2-D table (list of rows or numpy array). A 1-D input is
        treated as a single row; an empty input gives an empty result.

    Returns
    -------
    numpy.ndarray
        1-D array whose i-th entry is the sum of column ``i``.
    """
    # atleast_2d lets empty and single-row inputs go through the same path
    # instead of failing on `tab[0]`.
    table = np.atleast_2d(np.asarray(tab))
    return table.sum(axis=0)

In [4]:
def df_n_plus_presents(n, corpus, stopwords_set):
    """Build a DataFrame of the ``n`` most frequent words of ``corpus``.

    Parameters
    ----------
    n : int
        Number of words wanted. Values larger than the vocabulary size are
        clamped to it; values <= 0 give an empty frame.
    corpus : iterable of str
        Documents to count words in.
    stopwords_set : set, list, str or None
        Stop words handed to ``CountVectorizer``. Sets are converted to a
        list, since newer scikit-learn releases validate this parameter and
        do not accept a set.

    Returns
    -------
    pandas.DataFrame
        Columns ``Word`` and ``Nb_occ`` (total occurrences over the corpus),
        sorted by decreasing ``Nb_occ``.
    """
    stop_words_arg = stopwords_set
    if isinstance(stopwords_set, (set, frozenset)):
        stop_words_arg = list(stopwords_set)

    vectorizer = CountVectorizer(stop_words=stop_words_arg)
    X = vectorizer.fit_transform(corpus)

    # get_feature_names() was replaced by get_feature_names_out() in recent
    # scikit-learn; fall back to the old name on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        dico = np.asarray(vectorizer.get_feature_names_out())
    else:
        dico = np.asarray(vectorizer.get_feature_names())

    # Column totals straight from the sparse matrix (no dense copy needed).
    nb_occ = np.asarray(X.sum(axis=0)).ravel()

    n = max(0, min(n, nb_occ.size))
    if n == 0:
        # Slicing with [-0:] would select every word, so handle it explicitly.
        return pd.DataFrame({'Word': pd.Series(dtype=object),
                             'Nb_occ': pd.Series(dtype='int64')})

    # Pick the n largest counts, then order just those by decreasing count.
    ind = np.argpartition(nb_occ, -n)[-n:]
    ind = ind[np.argsort(-nb_occ[ind])]

    return pd.DataFrame({'Word': dico[ind], 'Nb_occ': nb_occ[ind]})

In [5]:
def get_hist(df, x_axis, y_axis, titre, colour, font_size=None, horizontal=False):
    """Draw a bar chart of ``df`` and save it as ``img/<titre>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    x_axis, y_axis : str
        Column names used for the categories and for the bar lengths.
    titre : str
        Plot title; also used as the file name of the saved image.
    colour : str
        Bar colour.
    font_size : int, optional
        Tick label font size, forwarded to pandas as ``fontsize``.
    horizontal : bool, default False
        Draw horizontal bars (``barh``) instead of vertical ones.

    Returns
    -------
    None
        The figure is written to disk; nothing is returned.
    """
    plot_bars = df.plot.barh if horizontal else df.plot.bar
    hist = plot_bars(x=x_axis, y=y_axis, color=colour, title=titre,
                     fontsize=font_size, edgecolor="none").get_figure()

    path_fig = "img/"+titre+'.png'
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(os.path.dirname(path_fig), exist_ok=True)
    hist.savefig(path_fig, bbox_inches="tight")

In [6]:
# Stop-word set: NLTK and scikit-learn English lists, plus a few spoken
# fillers and every integer below one million written as a string.
nltk_sw = set(stopwords.words('english'))
sklearn_sw = set(stop_words.ENGLISH_STOP_WORDS)
l_mots = ["don", "yeah", "hey", "okay", "oh", "uh", "yes", "ok"]
l_nb = list(map(str, range(1000000)))
stopwords_set = nltk_sw.union(sklearn_sw, l_mots, l_nb)

In [7]:
# Most frequent words of Lost season 1: build the table, save the bar chart,
# then display the table.
n = 50
corpus = get_corpus("data/1___Lost/01")
df_count = df_n_plus_presents(n, corpus, stopwords_set)
titre = "les {} mots les plus presents dans la premiere saison de Lost".format(n)
get_hist(df_count, x_axis="Word", y_axis="Nb_occ", titre=titre,
         colour="teal", font_size=7)
df_count

FileNotFoundError: [Errno 2] No such file or directory: 'data/1___Lost/01'

In [None]:
# TF-IDF matrix of the corpus: one row per document, one column per word.
# stop_words is passed as a list because newer scikit-learn releases validate
# this parameter and do not accept a set.
vectorizer = TfidfVectorizer(stop_words = list(stopwords_set))
X = vectorizer.fit_transform(corpus)
# get_feature_names() was replaced by get_feature_names_out() in recent
# scikit-learn; fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())
dico = list(feature_names)
dense = X.todense()
denselist = dense.tolist()
df_tfidf = pd.DataFrame(denselist, columns=feature_names)
# Label slice: shows the columns from 'gonna' onwards (the word must be in
# the vocabulary).
df_tfidf.loc[:,'gonna':].head()

In [None]:
nlargest = 20
# For every document (row), column positions of its highest TF-IDF scores,
# best first.
order = np.argsort(-df_tfidf.values, axis=1)[:, :nlargest]
# Index the underlying ndarray of labels: recent pandas versions no longer
# allow 2-D fancy indexing directly on an Index.
top_words = df_tfidf.columns.values[order]
# order.shape[1] can be smaller than nlargest when the vocabulary is small,
# so the column labels follow the actual width.
result = pd.DataFrame(top_words,
                      columns=['top{}'.format(i) for i in range(1, order.shape[1]+1)],
                      index=df_tfidf.index)
result.head()

In [None]:
# One horizontal bar chart per episode showing its top TF-IDF words.
nbep = 1
for i in range(nbep):
    # Row i as a Series: scores indexed by word / top-k words of the episode.
    lig_df = df_tfidf.iloc[i]
    lig_res = result.iloc[i]

    mots = list(lig_res)
    # Scalar lookups avoid calling float() on a one-element Series, which
    # recent pandas versions deprecate.
    values = [float(lig_df[mot]) for mot in mots]
    # Build the frame with typed columns directly (no string round-trip).
    df = pd.DataFrame({'Word': mots, 'Tfidf': values})
    # Ascending order so the largest score ends up at the top of the barh plot.
    df = df.sort_values(by='Tfidf', ascending=True)
    titre = "top "+str(nlargest)+" tf-idf scores for Lost season 1 episode "+str(i+1)
    get_hist(df, "Word", "Tfidf", titre, "limegreen", horizontal=True)