## Import Packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import en_core_sci_lg
from tqdm import tqdm
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import string
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.models.callbacks import CallbackAny2Vec

## Load Text

In [None]:
# CSV file holding the documents to cluster.
data_path = '/Users/codyotoole/Desktop/Research_Data/Results/DETM/gd/result_gd.csv'

# Read the 'Date' column as plain strings and expose the 'Content' column
# under the name 'text', which is the name the following cells use.
data_df = pd.read_csv(data_path, dtype={'Date': str}).rename(
    columns={'Content': 'text'}
)

### Split the Documents into Paragraphs

In [None]:
def _split_into_paragraphs(text, span=10):
    """Group the sentences of ``text`` into chunks of ``span`` sentences.

    Sentences are delimited by the literal string '. '. Consecutive runs of
    ``span`` sentences are re-joined with '. ' to form one pseudo-paragraph.

    Args:
        text: The full document text (a ``str``).
        span: Number of sentences per paragraph.

    Returns:
        A list of paragraph strings; always at least one element.
    """
    sentences = text.split('. ')
    return [". ".join(sentences[i:i + span]) for i in range(0, len(sentences), span)]


print('splitting by paragraphs...')

# Replace each document by its list of paragraphs in a single column
# assignment. The previous per-row `data_df['text'][n] = docs` was a chained
# assignment, which pandas does not guarantee to write back to the frame.
data_df['text'] = data_df['text'].apply(_split_into_paragraphs)

### Processing Texts

In [None]:
# Spread every paragraph list over its own columns (one column per paragraph
# position), then melt back to long form so each paragraph gets its own row.
# Rows created by padding shorter lists are removed by dropna().
paragraph_columns = data_df['text'].apply(pd.Series)
paragraph_rows = paragraph_columns.reset_index().melt(id_vars='index').dropna()
jobs = paragraph_rows[['index', 'value']].set_index('index')

# Re-attach the title and date of the source document to every paragraph by
# joining on the original row index.
with_title = pd.merge(jobs, data_df['Title'], left_index=True, right_index=True)
with_date = pd.merge(with_title, data_df[['Date']], left_index=True, right_index=True)

# From here on data_df holds one row per paragraph with a fresh 0..n-1 index.
data_df = with_date.rename(columns={'value': 'text'}).reset_index(drop=True)

def get_breaks(content, length):
    """Insert "<br>" markers into ``content`` roughly every ``length`` characters.

    The text is split on single spaces. Word lengths are accumulated (the
    separating spaces are not counted); the word that pushes the running total
    above ``length`` is prefixed with "<br>" and the total restarts at zero.
    Every other word is prefixed with a single space, so the result normally
    starts with a leading space.

    Args:
        content: Text to break up.
        length: Character budget between two breaks.

    Returns:
        The text with "<br>" markers inserted.
    """
    pieces = []
    running_total = 0

    for word in content.split(' '):
        running_total += len(word)
        if running_total > length:
            pieces.append("<br>" + word)
            running_total = 0
        else:
            pieces.append(" " + word)

    return "".join(pieces)

In [None]:
paragraph_text = data_df['text']

# Number of whitespace-separated words in each paragraph.
data_df['body_word_count'] = paragraph_text.apply(
    lambda body: len(body.strip().split())
)
# Number of distinct whitespace-separated words in each paragraph.
data_df['body_unique_words'] = paragraph_text.apply(
    lambda body: len(set(str(body).split()))
)
data_df.head()

In [None]:
# Stop words
# spacy_tokenizer lowercases every token before comparing it with this list,
# so all entries must be lowercase; mixed-case entries could never match.
stopwords = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
    'al.', 'elsevier', 'pmc', 'czi', 'www'
]

# Additional project-specific stop words, one per line of the file.
with open('/Users/codyotoole/Desktop/Research_Data/Lemma_Stop/Oxitec_Stop_List.txt', 'r') as f:
    stops = f.read().split('\n')

In [None]:
# Characters treated as punctuation when tokens are filtered.
punctuations = string.punctuation

# Load the en_core_sci_lg pipeline with the tagger and NER components disabled.
parser = en_core_sci_lg.load(
    disable=["tagger", "ner"],
)
#parser.max_length = 7000000

In [None]:
def spacy_tokenizer(sentence):
    """Lemmatise ``sentence`` and drop stop words and punctuation.

    The text is run through the module-level ``parser``. Each token is
    replaced by its lowercased, stripped lemma, except tokens whose lemma is
    "-PRON-", which keep their lowercased surface form. Tokens found in
    ``stopwords``, ``stops`` or ``punctuations`` are discarded.

    Args:
        sentence: Raw text to process.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lemmas = []
    for token in parser(sentence):
        if token.lemma_ != "-PRON-":
            lemmas.append(token.lemma_.lower().strip())
        else:
            lemmas.append(token.lower_)

    kept = [
        lemma
        for lemma in lemmas
        if lemma not in stopwords
        and lemma not in punctuations
        and lemma not in stops
    ]
    return " ".join(kept)

In [None]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Tokenise every paragraph, showing a progress bar while doing so.
processed_column = data_df["text"].progress_apply(spacy_tokenizer)
data_df["processed_text"] = processed_column

## Doc2Vec

In [None]:
# One token list per paragraph, in row order; tokens were joined with single
# spaces by the tokenizer, so splitting on ' ' recovers them.
words = [processed.split(' ') for processed in data_df["processed_text"]]

In [None]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a line when each epoch starts and ends."""

    def __init__(self):
        # Zero-based number of the epoch currently being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Announce the start of the current epoch."""
        print(f"Epoch #{self.epoch} start")

    def on_epoch_end(self, model):
        """Announce the end of the current epoch and advance the counter."""
        print(f"Epoch #{self.epoch} end")
        self.epoch += 1


epoch_logger = EpochLogger()

In [None]:
# One TaggedDocument per paragraph; the tag is the paragraph's position in
# `words`, which is also its row position in data_df.
sentences = [TaggedDocument(sentence, [i]) for i, sentence in enumerate(words)]

# `vector_size` is the current name of the embedding-dimension argument; the
# former `size` keyword is deprecated in gensim 3.x and rejected by gensim 4.
model = Doc2Vec(min_count=2, window=10, vector_size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences)

model.train(sentences, epochs=10, total_examples=model.corpus_count, callbacks=[epoch_logger])

# Fetch the matrix of document vectors in a way that works on both gensim
# generations: gensim 4 exposes `model.dv.vectors`, gensim 3 exposes
# `model.docvecs.vectors_docs`.
doc_vectors = getattr(model, "dv", None)
if doc_vectors is None:
    doc_vectors = model.docvecs
raw_vectors = getattr(doc_vectors, "vectors_docs", None)
if raw_vectors is None:
    raw_vectors = doc_vectors.vectors

# Feature matrix used by the following cells: one row per paragraph.
X = np.array(raw_vectors, dtype='float')

In [None]:
# A fractional n_components asks PCA to keep enough components to explain
# that share (here 95%) of the variance.
pca = PCA(random_state=42, n_components=0.95)
X_reduced = pca.fit_transform(X)
X_reduced.shape

### K-means clustering

In [None]:
# Run k-means for a range of k and record the distortion of each fit: the
# mean Euclidean distance from every sample to its nearest cluster centre.
distortions = []
K = range(2, 20)
for k in K:
    # Fit once; a second fit() call on the same data only repeats the work.
    k_means = KMeans(n_clusters=k, random_state=42).fit(X_reduced)
    nearest_centre = cdist(X_reduced, k_means.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centre.mean())
    #print('Found distortion for {} clusters'.format(k))

In [None]:
# End points of the straight reference line drawn over the elbow curve:
# the first and last k together with their distortions.
X_line, Y_line = ([series[0], series[-1]] for series in (K, distortions))

In [None]:
# Plot the elbow: distortion per k in blue, straight reference line between
# the first and last point in red.
for xs, ys, line_format in ((K, distortions, 'b-'), (X_line, Y_line, 'r')):
    plt.plot(xs, ys, line_format)

plt.title('The Elbow Method showing the optimal k')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.show()

In [None]:
# Number of clusters used for the final k-means model.
k = 6
kmeans = KMeans(random_state=42, n_clusters=k)

# Cluster the PCA-reduced vectors and store each paragraph's cluster label.
data_df['y'] = y_pred = kmeans.fit_predict(X_reduced)

# Reduce dimensions for plotting: t-SNE is run on the full document vectors X.
tsne = TSNE(random_state=42, perplexity=50, verbose=1)
X_embedded = tsne.fit_transform(X)

In [None]:
# Label the t-SNE embedding via k-means
# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors: one per cluster of the fitted k-means model, so the palette keeps
# matching the hue levels if the number of clusters is changed
palette = sns.hls_palette(kmeans.n_clusters, l=.4, s=.9)
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})

# plot
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y_pred, legend='full', palette=palette)
plt.title('t-SNE with Kmeans Labels')

plt.show()

## Topic Model On The Clusters

In [None]:
# One independent CountVectorizer per k-means cluster. The count is taken
# from the fitted model instead of being hard-coded.
# The token pattern is a raw string so that `\-` reaches the regex engine
# unchanged instead of being an invalid string escape.
vectorizers = [
    CountVectorizer(
        min_df=5, max_df=0.9, stop_words='english', lowercase=True,
        token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    for _ in range(kmeans.n_clusters)
]

vectorizers[0]

In [None]:
# Bag-of-words matrix for every cluster, in cluster order. A cluster whose
# texts cannot be vectorised is recorded as None so positions stay aligned
# with the cluster numbers.
vectorized_data = []

for current_cluster, cvec in enumerate(vectorizers):
    cluster_texts = data_df.loc[data_df['y'] == current_cluster, 'processed_text']
    try:
        vectorized_data.append(cvec.fit_transform(cluster_texts))
    except ValueError as e:
        # CountVectorizer signals an unusable corpus (e.g. no terms left after
        # min_df/max_df pruning) with ValueError; anything else is a real bug
        # and is allowed to propagate.
        print("Not enough instances in cluster: " + str(current_cluster) + " (" + str(e) + ")")
        vectorized_data.append(None)

len(vectorized_data)

In [None]:
# number of topics per cluster
NUM_TOPICS_PER_CLUSTER = 10

# One LDA model per cluster vectorizer. The number of models follows the
# number of clusters; it is unrelated to the number of topics per cluster.
lda_models = [
    LatentDirichletAllocation(
        n_components=NUM_TOPICS_PER_CLUSTER, max_iter=10,
        learning_method='online', verbose=False, random_state=42)
    for _ in vectorizers
]

lda_models[0]

In [None]:
# Fit each cluster's LDA model on that cluster's bag-of-words matrix and keep
# the resulting document-topic matrices. Clusters without a matrix (None) are
# skipped.
clusters_lda_data = []

# zip() stops at the shorter sequence, so a surplus of models can never index
# past the end of vectorized_data.
for lda, cluster_matrix in zip(lda_models, vectorized_data):
    # `is not None` avoids invoking the sparse matrix's element-wise `!=`.
    if cluster_matrix is not None:
        clusters_lda_data.append(lda.fit_transform(cluster_matrix))
        

In [None]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=5):
    """Return the distinct top keywords of all topics, strongest first.

    For every row of ``model.components_`` the ``top_n`` highest-weighted
    vocabulary terms are taken. A term already contributed by an earlier topic
    is skipped, so each keyword appears once with the weight of its first
    occurrence. The collected keywords are ordered by descending weight.

    Args:
        model: Fitted topic model exposing ``components_`` (one weight array
            per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        top_n: Number of terms taken from each topic.

    Returns:
        A list of keyword strings.
    """
    # Look the vocabulary up once instead of once per keyword, and support
    # both the current and the legacy scikit-learn accessor.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    seen = set()
    keywords = []
    for topic in model.components_:
        # argsort is ascending; the reversed tail holds the top_n largest.
        for i in topic.argsort()[:-top_n - 1:-1]:
            name = feature_names[i]
            if name not in seen:
                seen.add(name)
                keywords.append((name, topic[i]))

    # Ascending stable sort followed by reverse (rather than reverse=True)
    # keeps the established ordering among equal weights.
    keywords.sort(key=lambda pair: pair[1])
    keywords.reverse()
    return [name for name, _ in keywords]

In [None]:
# Keyword list for every cluster that has a bag-of-words matrix, in cluster
# order. Clusters without a matrix (None) contribute no entry.
all_keywords = []

# Walk models, vectorizers and matrices in lockstep; zip() stops at the
# shortest sequence, so a surplus of models cannot cause an IndexError.
for lda, cluster_vectorizer, cluster_matrix in zip(lda_models, vectorizers, vectorized_data):
    if cluster_matrix is not None:
        all_keywords.append(selected_topics(lda, cluster_vectorizer))

len(all_keywords)

## It's good to save your stuff

In [None]:
#SAVE STUFF
# Write one line per cluster to topics.txt, in cluster order. all_keywords
# only holds entries for clusters that have a bag-of-words matrix, so the
# keyword lists are consumed one at a time as those clusters are reached; a
# cluster without a matrix gets the placeholder line instead.
remaining_keywords = iter(all_keywords)

with open('topics.txt', 'w') as f:
    for cluster_matrix in vectorized_data:
        keywords = None if cluster_matrix is None else next(remaining_keywords, None)
        if keywords is None:
            f.write("Not enough instances to be determined. \n")
        else:
            f.write(', '.join(keywords) + "\n")

# save the final t-SNE
with open("X_embedded.p", "wb") as out:
    pickle.dump(X_embedded, out)

# save the labels generated with k-means
with open("y_pred.p", "wb") as out:
    pickle.dump(y_pred, out)

## Continue Onward
Classify and test your k-means clusters

In [None]:
def classification_report(model_name, test, pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Precision, recall and F1 are macro-averaged. Every score is printed as a
    percentage with three decimals.

    Args:
        model_name: Heading printed above the scores.
        test: True labels.
        pred: Predicted labels.
    """
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    def as_percent(score):
        return '{:,.3f}'.format(float(score) * 100)

    accuracy = accuracy_score(test, pred)
    precision = precision_score(test, pred, average='macro')
    recall = recall_score(test, pred, average='macro')
    f1 = f1_score(test, pred, average='macro')

    print(model_name, ":\n")
    print("Accuracy Score: ", as_percent(accuracy), "%")
    print("     Precision: ", as_percent(precision), "%")
    print("        Recall: ", as_percent(recall), "%")
    print("      F1 score: ", as_percent(f1), "%")


In [None]:
# test set size of 20% of the data and the random seed 42 <3
# X is built with np.array(...), and numpy arrays have no .toarray(); only
# densify when X really is a sparse matrix.
X_dense = X.toarray() if hasattr(X, "toarray") else X
X_train, X_test, y_train, y_test = train_test_split(X_dense, y_pred, test_size=0.2, random_state=42)

print("X_train size:", len(X_train))
print("X_test size:", len(X_test), "\n")

In [None]:
# Build the SGD classifier and train it on the training split; fit() returns
# the estimator itself.
sgd_clf = SGDClassifier(
    max_iter=10000,
    tol=1e-3,
    random_state=42,
    n_jobs=4,
).fit(X_train, y_train)

# 3-fold cross-validated predictions on the training split.
sgd_pred = cross_val_predict(sgd_clf, X_train, y_train, n_jobs=4, cv=3)

# Print the scores for those predictions.
classification_report("Stochastic Gradient Descent Report (Training Set)", y_train, sgd_pred)

In [None]:
# cross validation predictions on the held-out split
# NOTE(review): cross_val_predict refits copies of the classifier on folds of
# the test split; it does not score the model fitted on the training split.
# If the intent is to evaluate that fitted model, sgd_clf.predict(X_test) is
# the call to use - confirm.
sgd_pred = cross_val_predict(sgd_clf, X_test, y_test, cv=3, n_jobs=4)

# print out the classification report; the heading names the split that is
# actually being evaluated here
classification_report("Stochastic Gradient Descent Report (Test Set)", y_test, sgd_pred)

In [None]:
# 10-fold cross-validation score over the full data set.
# X is built with np.array(...), and numpy arrays have no .toarray(); only
# densify when X really is a sparse matrix.
X_dense = X.toarray() if hasattr(X, "toarray") else X
sgd_cv_score = cross_val_score(sgd_clf, X_dense, y_pred, cv=10)
print("Mean cv Score - SGD: {:,.3f}".format(float(sgd_cv_score.mean()) * 100), "%")

## Visualize the Data

In [None]:
# Read the saved keyword lines back, one list element per line (newline kept).
topic_path = 'topics.txt'
with open(topic_path) as f:
    topics = list(f)

In [None]:
# Figure size used for the seaborn plots below.
sns.set(rc={'figure.figsize': (15, 15)})

# Six evenly spaced hues with fixed lightness and saturation.
palette = sns.hls_palette(n_colors=6, l=.4, s=.9)

In [None]:
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})

# plot
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y_pred, legend='full', palette=palette)

plt.title('t-SNE with Kmeans Labels')

# Hand-placed caption for every cluster: (x, y, text, box alpha). The list
# position selects the palette colour, so entry i is drawn in the colour of
# palette[i]. The coordinates are in the units of the t-SNE axes.
cluster_captions = [
    (-60, 20, 'Disease, Virus, Transmission', 0.8),
    (-15, -20, 'CRISPR, Genetic Engineering', 0.8),
    (-40, 35, 'Risk, Release, Public', 0.8),
    (-37, 0, 'Pests/Insects, Vector-Borne', 0.8),
    (-20, -67, 'Expression, Mutation, Genes', 0.5),
    (20, 10, 'Drive, Edit, Transgenic', 0.8),
]
for colour_index, (x_pos, y_pos, caption, box_alpha) in enumerate(cluster_captions):
    plt.text(x_pos, y_pos, caption, fontweight='bold', fontsize=15,
             bbox={'facecolor': palette[colour_index], 'alpha': box_alpha, 'pad': 10})

In [None]:
# Write the current matplotlib figure to a PNG file in the working directory.
# NOTE(review): this call sits in its own cell; with an inline notebook
# backend the figure drawn by the previous cell may already be closed, in
# which case an empty image is saved - confirm, or move this call into the
# plotting cell.
plt.savefig("final_cluster_tsne.png")