# In [None]:
from pathlib import Path
import pandas as pd
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.utils import simple_preprocess
from yellowbrick.cluster import KElbowVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from random import choice
import scipy

# Base directory for the files this script reads and writes: the current
# working directory.
BASEDIR = Path.cwd()

# Read the raw database, using the first CSV column as the row index.
df = pd.read_csv(BASEDIR.joinpath('data.csv'), index_col=0)

# Download NLTK's stop-word corpus, then take its Portuguese word list.
nltk.download('stopwords')
stop_words = stopwords.words('portuguese')

# preprocessing definitions
def sent_to_words(sentences):
    """Lazily turn each item of ``sentences`` into a list of tokens.

    Every item is converted with ``str()`` and tokenized by gensim's
    ``simple_preprocess`` with ``deacc=True``; one token list is yielded per
    input item, in input order.
    """
    yield from (
        simple_preprocess(str(text), deacc=True) for text in sentences
    )

def remove_stopwords(texts):
    """Tokenize each document and drop the stop words.

    Args:
        texts: Iterable of documents. Each one is passed through ``str()``
            and ``simple_preprocess``, so both raw strings and token lists
            are accepted.

    Returns:
        A list holding one list of kept tokens per document.
    """
    # NLTK's Portuguese stop words include accented forms (e.g. 'não'), while
    # tokens produced with ``deacc=True`` have their accents stripped ('nao')
    # and would never match. Accept both spellings, and use a set so each
    # lookup is O(1) instead of a scan of the list.
    blocked = set(stop_words)
    blocked.update(gensim.utils.deaccent(word) for word in stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Return a list with the module-level ``bigram_mod`` applied to each document.

    ``texts`` is any iterable of documents; each one is looked up as
    ``bigram_mod[document]`` and the results are collected in input order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

# preprocessing applications
# Rows with no 'inteiro_teor' text cannot be tokenized, so drop them first.
df = df.dropna(subset=['inteiro_teor'])
df['inteiro_teor'] = df['inteiro_teor'].str.lower()
df['processed_text'] = remove_stopwords(list(sent_to_words(df['inteiro_teor'])))

# Learn two-word phrases from the cleaned token lists.
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(df['processed_text'], min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The column already holds token lists, so the phrase model is applied to them
# directly (no need to stringify and re-tokenize them first). Each document is
# then joined back into one space-separated string for the vectorizer.
df['processed_text'] = [' '.join(tokens) for tokens in make_bigrams(df['processed_text'])]


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.processed_text)
# The elbow search and the final KMeans below both run on a dense array.
X = X.toarray()

# Seeded like the final model so the elbow search is reproducible too.
model = KMeans(random_state=42)

# KElbowVisualizer helps find the optimal number of clusters. The tuple k sets the bounds of the cluster counts to be tested.
visualizer = KElbowVisualizer(model, k=(10,40))
visualizer.fit(X)

# elbow_value_ is None when the visualizer detects no elbow; stop with a clear
# message instead of letting KMeans fail on n_clusters=None.
n_clusters = visualizer.elbow_value_
if n_clusters is None:
    raise ValueError(
        'KElbowVisualizer found no elbow for k in (10, 40); '
        'choose the number of clusters manually.'
    )

# We create a new instance of the model, using the elbow value defined by the KElbowVisualizer as the number of clusters
model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=42)

model.fit(X)
df['cluster_00'] = model.predict(X)
df.to_csv(BASEDIR/'cluster_00.csv')

# we create a sample of each cluster: up to 12 rows per cluster, or the whole
# cluster when it has fewer rows
df_sample = pd.concat(
    [group.sample(min(12, len(group))) for _, group in df.groupby('cluster_00')]
)
# Shuffle the rows so they are no longer grouped by cluster. sample() returns
# a new dataframe, so the result must be assigned back to take effect. The
# index is left untouched, so every sampled row keeps the index label it has
# in df.
df_sample = df_sample.sample(frac=1)
df_sample.to_csv(BASEDIR/'sample_00.csv')

# we assign random researchers to each decision
researchers = ['A', 'B', 'C', 'D', 'E']
# One independent random draw per sampled row.
df_sample['researcher'] = [choice(researchers) for _ in range(len(df_sample))]
# Export without the processed_text, arquivo and cluster_00 columns. The file
# uses the standard .xlsx extension so pandas can infer the writer engine and
# spreadsheet applications recognise it.
df_sample.drop(columns=['processed_text', 'arquivo', 'cluster_00']).to_excel(BASEDIR/'classifications__00.xlsx')
