In [None]:
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import transformers
from flask import Flask, flash, request, redirect, url_for, render_template
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from werkzeug.utils import secure_filename

In [None]:
# Fetch NLTK's stopword corpus so that stopwords.words() can read it
nltk.download('stopwords')
# English stopwords held in a set for fast membership tests
stops = {word for word in stopwords.words("english")}


#Remove stopwords from text to create higher differences between text by removing common words that don't matter for our context
def remove_stopwords(sentence):
    """Return `sentence` with every stopword dropped.

    The input is coerced with str() and split on whitespace. A token is
    discarded when its lowercase form is in the module-level `stops` set;
    surviving tokens keep their original casing and are re-joined with single
    spaces.
    """
    kept = []
    for token in str(sentence).split():
        if token.lower() in stops:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
app = Flask(__name__)

# flash() stores its messages in the signed session cookie, and Flask raises a
# RuntimeError when the session is written without a secret key. Prefer a
# stable key from the FLASK_SECRET_KEY environment variable; otherwise fall
# back to a random key that only lives as long as this process.
app.secret_key = os.environ.get('FLASK_SECRET_KEY') or os.urandom(32)

#Set 500 MB filesize limit
app.config['MAX_CONTENT_LENGTH'] = 500 * 1000 * 1000
UPLOAD_FOLDER = '/path/to/the/uploads'
# Upload extensions (lowercase, without the dot) accepted by allowed_file()
ALLOWED_EXTENSIONS = {'csv'}


def allowed_file(filename):
    """Tell whether `filename` carries an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last '.', compared in lowercase.
    Names without any '.' are rejected.
    """
    if '.' not in filename:
        return False
    _, _, extension = filename.rpartition('.')
    return extension.lower() in ALLOWED_EXTENSIONS

In [None]:
def read_file(file):
    """Parse the uploaded CSV into a pandas DataFrame.

    `file` is handed straight to pandas.read_csv, so it may be anything that
    function accepts (a path or a readable file-like object).
    """
    return pd.read_csv(file)

In [None]:
def prepare_data(topics):
    """Turn the raw topics table into an array of texts ready for embedding.

    For every row the Description is stripped of HTML tags and of the
    entities &nbsp;, &gt; and &amp (with or without its trailing ';'), passed
    through remove_stopwords, and appended to the Title after a single space.
    Newline characters are then removed from the combined text. The Title is
    not cleaned of markup. Missing titles or descriptions are treated as empty
    strings.

    Args:
        topics: DataFrame with 'Title' and 'Description' columns. It is not
            modified.

    Returns:
        A one-dimensional NumPy array of strings, one entry per row of
        `topics`, in the same order.

    Raises:
        KeyError: If 'Title' or 'Description' is missing from `topics`.
    """
    missing = [col for col in ('Title', 'Description') if col not in topics.columns]
    if missing:
        raise KeyError(f"Input is missing required column(s): {', '.join(missing)}")

    # '&amp;?' consumes the optional semicolon; a bare '&amp' would leave a
    # stray ';' in the text for every '&amp;'.
    markup = '|'.join([r'<[^<>]*>', r'&nbsp;', r'&gt;', r'&amp;?'])

    # fillna/astype first: a NaN description would otherwise come out of
    # remove_stopwords as the literal word "nan".
    descriptions = (
        topics['Description']
        .fillna('')
        .astype(str)
        .str.replace(markup, ' ', regex=True)
        .apply(remove_stopwords)
    )

    # A NaN title would turn the whole concatenation into NaN, which the
    # embedding model cannot encode.
    titles = topics['Title'].fillna('').astype(str)

    # Title and description are analysed together as one text; newlines are
    # removed because they might influence the results.
    combined = (titles + ' ' + descriptions).str.replace('\n', '', regex=False)

    # Convert to a NumPy array so that the model can interpret it
    return combined.to_numpy()

In [None]:
# Loaded SentenceTransformer instances keyed by model name, so that repeated
# calls (one per upload) reuse the model instead of loading it again.
_MODEL_CACHE = {}


def _get_model(model_name):
    """Return the SentenceTransformer for `model_name`, loading it on first use."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def run_model(data, model_name='sentence-transformers/all-mpnet-base-v2'):
    """Encode the prepared texts into sentence embeddings.

    Args:
        data: The texts to embed, e.g. the array returned by prepare_data.
        model_name: Name of the pretrained sentence-transformers model to
            use. Defaults to the model this notebook was written against.

    Returns:
        The result of SentenceTransformer.encode(data), i.e. one embedding
        per input text.
    """
    return _get_model(model_name).encode(data)
    

In [None]:
def calculate_similarities(embeddings):
    """Compute the pairwise cosine similarity between all embeddings.

    Uses scikit-learn's cosine_similarity rather than a model object: the
    SentenceTransformer instance is local to run_model and is not reachable
    from here.

    Args:
        embeddings: 2-D array-like with one embedding per row.

    Returns:
        A square DataFrame in which cell (i, j) holds the cosine similarity
        between embedding i and embedding j.
    """
    return pd.DataFrame(cosine_similarity(embeddings))

In [None]:
def generate_cluster(topics, embeddings, num_clusters, cosine_sensitivity):
    """Cluster the embeddings and record each row's cluster id on `topics`.

    Runs agglomerative clustering with cosine distance and average linkage.
    scikit-learn requires exactly one of `num_clusters` and
    `cosine_sensitivity` to be set; the other must be None.

    Args:
        topics: DataFrame with one row per embedding. An "n_cluster" column
            is added (or overwritten) on this object in place.
        embeddings: Array-like with one embedding per row of `topics`.
        num_clusters: Number of clusters to produce, or None.
        cosine_sensitivity: Value passed as the clustering
            `distance_threshold`, or None.

    Returns:
        The same `topics` object, now carrying the "n_cluster" column.
    """
    #TODO: Play around with HDBSCAN

    #TODO: Try hierarchical clustering

    if len(embeddings) < 2:
        # AgglomerativeClustering refuses to fit fewer than two samples, so an
        # empty or single-row upload gets the trivial assignment instead.
        topics["n_cluster"] = np.zeros(len(embeddings), dtype=int)
        return topics

    # Perform agglomerative clustering
    clustering_model = AgglomerativeClustering(
        n_clusters=num_clusters,
        distance_threshold=cosine_sensitivity,
        metric='cosine',
        linkage='average',
    )
    topics["n_cluster"] = clustering_model.fit(embeddings).labels_

    return topics

In [None]:
def save_output(topics):
    """Write `topics`, ordered by its n_cluster column, to cluster_output.csv.

    The file is created in the current working directory and includes the
    DataFrame index. The frame passed in is left in its original order.
    """
    topics.sort_values(by=['n_cluster']).to_csv('cluster_output.csv')

In [None]:
def _parse_clustering_params(form):
    """Read the clustering settings out of the submitted form.

    Args:
        form: Mapping of submitted form fields (supports .get()).

    Returns:
        A (num_clusters, cosine_sensitivity) tuple in which exactly one entry
        is set: the sensitivity when clusteringType is "auto", otherwise the
        cluster count.

    Raises:
        ValueError: If the relevant field is missing, is not a number, or is
            out of range. The message is meant to be shown to the user.
    """
    if form.get("clusteringType") == "auto":
        try:
            sensitivity = float(form.get("sensitivity"))
        except (TypeError, ValueError):
            raise ValueError('Sensitivity must be a number') from None
        # Written as "not >=" so that NaN is rejected as well.
        if not sensitivity >= 0:
            raise ValueError('Sensitivity must be zero or greater')
        return None, sensitivity

    try:
        count = int(form.get("numClusters"))
    except (TypeError, ValueError):
        raise ValueError('Number of clusters must be a whole number') from None
    if count < 1:
        raise ValueError('Number of clusters must be at least 1')
    return count, None


@app.route('/', methods=['GET', 'POST'])
def index_page():
    """Serve the clustering page and handle CSV uploads.

    GET renders the "Clustering Page.html" template. POST expects a 'file'
    upload plus the form fields clusteringType and either sensitivity (when
    clusteringType is "auto") or numClusters. A valid CSV upload is read,
    prepared, embedded, clustered and written out through save_output, after
    which a small upload form is returned. Missing files and invalid form
    values are reported with flash() and a redirect back to this page; an
    upload whose extension is not allowed falls through to the template.
    """
    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # If the user does not select a file, the browser submits an
        # empty file without a filename.
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        # Validate the form values up front: a missing or malformed number
        # would otherwise surface as an unhandled exception.
        try:
            num_clusters, cosine_sensitivity = _parse_clustering_params(request.form)
        except ValueError as exc:
            flash(str(exc))
            return redirect(request.url)
        if file and allowed_file(file.filename):
            topics = read_file(file)
            data = prepare_data(topics)
            embeddings = run_model(data)
            topics = generate_cluster(topics, embeddings, num_clusters, cosine_sensitivity)
            save_output(topics)

            return  ''' <!doctype html>
    <title>Upload new File</title>
    <h1>Upload new File</h1>
    <form method=post enctype=multipart/form-data>
      <input type=file name=file accept = .csv>
      <input type=submit value=Upload>
    </form>
    '''

    return render_template("Clustering Page.html")

In [None]:
# Start Flask's built-in server only when this code is executed directly
if __name__ == "__main__":
    app.run()