In [1]:
# Download the combined all-years patent claim CSV from S3 into the local
# input directory; this file is the input of the first-level clustering run.
!aws s3 cp s3://ncw210data/FullTextData/patent_claim_data_all_years.csv /home/ubuntu/data/inputdata/patent_claim_data_all_years.csv

download: s3://ncw210data/FullTextData/patent_claim_data_all_years.csv to ../data/inputdata/patent_claim_data_all_years.csv


In [3]:
# Print the first line of the downloaded CSV (its header row) to check the column names.
!head -1 /home/ubuntu/data/inputdata/patent_claim_data_all_years.csv

appl_doc_number,appl_country,appl_date,pub_doc_number,pub_date,number_of_days,invention_title,abstract,claim_text,claim_text_stemmed


In [1]:
# -*- coding: utf-8 -*-
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import NMF
import csv

#clustering and small files creation
import pickle
import nltk
nltk.download('punkt')
import pandas as pd
import numpy as np

# English Snowball stemmer instance.
# NOTE(review): not referenced by any cell visible in this notebook (the input
# file already has a `claim_text_stemmed` column) - confirm before removing.
sb_stemmer = nltk.stem.SnowballStemmer('english')
# Hand-listed English stop words.
# NOTE(review): the vectorizers in the visible cells build their stop list from
# sklearn's `text.ENGLISH_STOP_WORDS`, not from this set, so it appears unused
# here - confirm before removing.
stopwords = {'back', 'thru', 'eg', 'hereafter', 'too', 'part', 'which', 'will', 'be', 'thereupon', 'about', 'nevertheless', 'therein', 'through', 'we', 'among', 'in', 'then', 'former', 'via', 'below', 'whereafter', 'due', 'you', 'bill', 'forty', 'few', 'not', 'with', 'rather', 'next', 'nine', 'me', 'its', 'sometime', 'yours', 'who', 'whoever', 'down', 'some', 'such', 'thereafter', 'hasnt', 'fifteen', 'both', 'as', 'ever', 'could', 'find', 'hence', 'something', 'a', 'there', 'mostly', 'whereas', 'many', 'serious', 'can', 'indeed', 'afterwards', 'whenever', 'by', 'becomes', 'may', 'after', 'couldnt', 'seemed', 'anyhow', 'etc', 'might', 'already', 'no', 'please', 'them', 'myself', 'therefore', 'from', 'along', 'ltd', 'against', 'everywhere', 'amoungst', 'because', 'where', 'sixty', 'ie', 'although', 'sincere', 'move', 'seeming', 'or', 'wherever', 'inc', 'whatever', 'into', 'anywhere', 'around', 'nor', 'see', 'several', 'sometimes', 'for', 'interest', 'beyond', 'whether', 'detail', 'describe', 'moreover', 'nobody', 'whereupon', 're', 'without', 'an', 'ours', 'perhaps', 'only', 'five', 'towards', 'keep', 'eleven', 'one', 'other', 'any', 'otherwise', 'except', 'that', 'cannot', 'behind', 'ourselves', 'under', 'within', 'fifty', 'across', 'if', 'thus', 'per', 'wherein', 'here', 'empty', 'co', 'still', 'whole', 'how', 'off', 'to', 'yourself', 'call', 'cry', 'four', 'so', 'she', 'take', 'their', 'been', 'now', 'even', 'mill', 'what', 'another', 'namely', 'always', 'themselves', 'almost', 'six', 'formerly', 'ten', 'found', 'onto', 'yet', 'between', 'give', 'hers', 'herein', 'eight', 'above', 'anyway', 'third', 'himself', 'front', 'over', 'two', 'much', 'latter', 'itself', 'besides', 'those', 'on', 'twenty', 'up', 'us', 'amongst', 'beforehand', 'but', 'most', 'same', 'mine', 'should', 'this', 'full', 'herself', 'her', 'thick', 'con', 'everything', 'is', 'am', 'three', 'throughout', 'again', 'enough', 'your', 'once', 'hereupon', 'become', 'yourselves', 'everyone', 'before', 
'i', 'whereby', 'others', 'must', 'seems', 'elsewhere', 'were', 'either', 'would', 'became', 'hundred', 'toward', 'very', 'latterly', 'top', 'often', 'beside', 'cant', 'else', 'the', 'however', 'and', 'somehow', 'him', 'noone', 'somewhere', 'our', 'nothing', 'de', 'fill', 'well', 'it', 'all', 'last', 'do', 'these', 'has', 'upon', 'every', 'side', 'system', 'put', 'thence', 'twelve', 'becoming', 'show', 'un', 'least', 'of', 'have', 'own', 'since', 'though', 'whither', 'out', 'hereby', 'meanwhile', 'none', 'while', 'whom', 'further', 'why', 'made', 'whose', 'my', 'someone', 'they', 'during', 'anyone', 'first', 'go', 'less', 'his', 'anything', 'thereby', 'amount', 'together', 'never', 'was', 'thin', 'also', 'each', 'fire', 'are', 'when', 'alone', 'had', 'until', 'done', 'more', 'at', 'than', 'nowhere', 'seem', 'whence', 'name', 'neither', 'he', 'get', 'being', 'bottom'}

def tokenize(doc):
    """Lower-case ``doc`` and split it on single space characters.

    The split is on the literal ``" "`` rather than on arbitrary whitespace,
    so consecutive spaces yield empty-string tokens and tabs or newlines
    remain inside tokens.
    """
    lowered = doc.lower()
    tokens = lowered.split(" ")
    return tokens

def save_topics(model, feature_names, no_top_words,topics_filename):
    """Write the highest-weighted terms of every topic to a CSV file.

    One row is written per row of ``model.components_``: the topic index,
    then a single space-separated string with the ``no_top_words`` feature
    names that have the largest weights in that topic, largest first.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, iterated row by row; each row
        must support ``argsort()``.
    feature_names : sequence of str
        Feature name for each column position of ``components_``.
    no_top_words : int
        Number of top-weighted terms to keep per topic.
    topics_filename : str
        Path of the CSV file to create (overwritten if it exists).
    """
    # newline='' is what the csv module requires for files handed to
    # csv.writer; without it, platforms that translate '\n' emit '\r\r\n'.
    # The encoding is pinned to UTF-8 (the encoding the input data is read
    # with) so the output does not depend on the machine's locale.
    with open(topics_filename, 'w', newline='', encoding='utf-8') as csvfile:
        # Build the writer once instead of once per topic.
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for topic_idx, topic in enumerate(model.components_):
            # argsort is ascending; the reversed tail slice yields the
            # indices of the no_top_words largest weights, largest first.
            top_indices = topic.argsort()[:-no_top_words - 1:-1]
            words = " ".join(feature_names[i] for i in top_indices)
            csvwriter.writerow([topic_idx, words])

def create_clusters(filename,level,cluster_number,no_topics=10):
    """Cluster patent claims into NMF topics and split them into per-topic files.

    Processing steps:

    1. Read ``filename`` (UTF-8 CSV). Every row whose ``appl_doc_number``
       occurs more than once is discarded (``keep=False``), as is every row
       without ``claim_text_stemmed``.
    2. Fit a TF-IDF vectorizer on ``claim_text_stemmed`` and an NMF model
       with ``no_topics`` components on the resulting matrix.
    3. Save the top terms per topic (``topics_<cluster_number>.csv``) and
       pickle both models (``nmf_model_<cluster_number>.sav``,
       ``tfidf_model_<cluster_number>.sav``).
    4. Turn each document's topic weights into percentages of the row total,
       take the arg-max as its top cluster and write everything to
       ``all_cluster_data_probs_<cluster_number>.csv``.
    5. Re-read that file in chunks and write one file per topic,
       ``small_cluster_data_<level>_<k>.csv``.

    All outputs go to the hard-coded directory ``/home/ubuntu/data/clusterdata/``
    and progress messages are printed to stdout.

    Parameters
    ----------
    filename : str
        Path of the input CSV. It must contain the ten input columns listed
        in ``input_columns`` below.
    level : str
        Label embedded in the names of the per-topic split files.
    cluster_number : str
        Label embedded in the names of the topics, model and probs files.
    no_topics : int, optional
        Number of NMF components and therefore of split files. Defaults to
        10, the value that was previously hard-coded.
    """
    output_dir = '/home/ubuntu/data/clusterdata/'
    input_columns = ['appl_doc_number', 'appl_country', 'appl_date', 'pub_doc_number', 'pub_date',
                     'number_of_days', 'invention_title', 'abstract', 'claim_text', 'claim_text_stemmed']

    df_data = pd.read_csv(filename, encoding='utf-8')
    print('file loaded')
    df_data = df_data.drop_duplicates(subset=['appl_doc_number'], keep=False)
    df_data = df_data.dropna(subset=['claim_text_stemmed'], how='all')

    # tokenize() splits on single spaces, so empty tokens and bare
    # parenthesis tokens are added to sklearn's English stop list.
    my_words = ['','()','(),']
    my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

    corpus_vectorizer = TfidfVectorizer(norm='l2', max_df=0.95, min_df=2, use_idf=True, smooth_idf=False, sublinear_tf=True,
                                    stop_words=set(my_stop_words),
                                    tokenizer=tokenize)

    corpus_tfidf = corpus_vectorizer.fit_transform(df_data['claim_text_stemmed'])
    print('vectorizer fit transform completed')

    # NOTE(review): `alpha=` and `get_feature_names()` below are kept as
    # written; newer scikit-learn releases renamed/removed them - check the
    # installed version before upgrading.
    nmfmodel = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
    H = nmfmodel.fit_transform(corpus_tfidf)
    print('nmf fit transform completed')

    tfidf_feature_names = corpus_vectorizer.get_feature_names()

    num_top_words = 2000
    fname = output_dir + 'topics_' + str(cluster_number) + '.csv'
    save_topics(nmfmodel, tfidf_feature_names, num_top_words, fname)

    # Save the models to disk; the context managers make sure the files are
    # flushed and closed even if pickling fails.
    nmf_filename = output_dir + 'nmf_model_' + str(cluster_number) + '.sav'
    with open(nmf_filename, 'wb') as model_file:
        pickle.dump(nmfmodel, model_file)

    tfidf_filename = output_dir + 'tfidf_model_' + str(cluster_number) + '.sav'
    with open(tfidf_filename, 'wb') as model_file:
        pickle.dump(corpus_vectorizer, model_file)

    print('models saved')

    # Document-topic weights as percentages of each row total, rounded to 4
    # decimals. Plain ndarrays are used instead of the deprecated np.matrix.
    # A document whose weights are all zero yields a NaN row (0/0) and
    # arg-max then puts it in cluster 0.
    doc_topic = np.asarray(H, dtype=float)
    row_totals = doc_topic.sum(axis=1, keepdims=True)
    cluster_proba_cols = np.around(doc_topic * 100 / row_totals, decimals=4)
    top_cluster = cluster_proba_cols.argmax(axis=1)

    # Input columns followed by top_cluster and the per-topic percentages.
    # Columns are labelled 0..N and top_cluster is stored as a float so the
    # file keeps the layout it had before this rewrite.
    df_meta = df_data[input_columns].reset_index(drop=True)
    df_scores = pd.DataFrame(np.column_stack((top_cluster.astype(float), cluster_proba_cols)))
    df_probs = pd.concat([df_meta, df_scores], axis=1, ignore_index=True)

    probs_filename = output_dir + 'all_cluster_data_probs_' + str(cluster_number) + '.csv'
    df_probs.to_csv(probs_filename, index=False, sep=',', encoding='utf-8')

    print('Probs saved')

    # Split the probs file into one small file per top cluster.
    output_columns = input_columns + ['top_cluster'] + ['pr_' + str(k) for k in range(no_topics)]
    small_file_prefix = output_dir + 'small_cluster_data_' + str(level)
    chunksize = 100000

    # header=0 skips the numeric header row written above (it used to be read
    # back as a data row), and the file is decoded with the encoding it was
    # written in so non-ASCII text is not corrupted.
    chunks = pd.read_csv(probs_filename, sep=',', header=0, names=output_columns,
                         encoding='utf-8', chunksize=chunksize)
    for chunk_idx, chunk in enumerate(chunks):
        is_first_chunk = chunk_idx == 0
        for k in range(no_topics):
            # The first chunk creates each file with a header row; later
            # chunks append rows only.
            chunk[chunk['top_cluster'] == k].to_csv(
                small_file_prefix + '_' + str(k) + '.csv',
                mode='w' if is_first_chunk else 'a',
                header=is_first_chunk,
                index=False, sep=',', encoding='utf-8')
    print('small files created')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# First-level clustering over the complete all-years file. 'X' is used as both
# the level and the cluster label, so the split files produced by this run are
# named small_cluster_data_X_<k>.csv.
level = cluster_number = 'X'
filename = '/home/ubuntu/data/inputdata/patent_claim_data_all_years.csv'
create_clusters(filename, level, cluster_number)

file loaded
vectorizer fit transform completed
nmf fit transform completed
models saved




Probs saved
small files created


In [None]:
# results_pr_stem = set()
# df_data['claim_text_pr_stem'].str.lower().str.split().apply(results_pr_stem.update)
# print(len(results_pr_stem))
# #print(results_pr_stem)


In [2]:
# Second-level clustering: run create_clusters again on each of the ten
# first-level split files. The parent cluster index is passed as both the
# level and the cluster label.
filepath = '/home/ubuntu/data/clusterdata/small_cluster_data_X_'
for i in range(10):
    level = cluster_number = str(i)
    filename = filepath + level + '.csv'
    print(filename)
    create_clusters(filename, level, cluster_number)

/home/ubuntu/data/clusterdata/small_cluster_data_X_0.csv
file loaded
vectorizer fit transform completed
nmf fit transform completed
models saved




Probs saved
small files created
/home/ubuntu/data/clusterdata/small_cluster_data_X_1.csv
file loaded
vectorizer fit transform completed
nmf fit transform completed
models saved
Probs saved
small files created
/home/ubuntu/data/clusterdata/small_cluster_data_X_2.csv
file loaded
vectorizer fit transform completed
nmf fit transform completed
models saved
Probs saved
small files created
/home/ubuntu/data/clusterdata/small_cluster_data_X_3.csv
file loaded
vectorizer fit transform completed
nmf fit transform completed
models saved
Probs saved
small files created
/home/ubuntu/data/clusterdata/small_cluster_data_X_4.csv
file loaded
vectorizer fit transform completed
nmf fit transform completed
models saved
Probs saved
small files created
/home/ubuntu/data/clusterdata/small_cluster_data_X_5.csv
file loaded
vectorizer fit transform completed
nmf fit transform completed
models saved
Probs saved
small files created
/home/ubuntu/data/clusterdata/small_cluster_data_X_6.csv
file loaded
vectorizer fit

In [52]:
#function to create tfidf models for second level clustered files
def create_tfidf_for_small_files(filename,p,c):
    """Fit a TF-IDF vectorizer on one cluster file and pickle it.

    Reads the ten input columns from ``filename``, discards every row whose
    ``appl_doc_number`` occurs more than once (``keep=False``) and every row
    without ``claim_text_stemmed``, fits the vectorizer on
    ``claim_text_stemmed`` and writes it to
    ``/home/ubuntu/data/clusterdata/tfidf_model_<p>_<c>.sav``.

    Parameters
    ----------
    filename : str
        Path of the cluster CSV file to read.
    p, c : int or str
        Labels embedded in the output file name; the calling loop passes
        the parent and child cluster indices.
    """
    input_columns = ['appl_doc_number', 'appl_country', 'appl_date', 'pub_doc_number', 'pub_date',
                     'number_of_days', 'invention_title', 'abstract', 'claim_text', 'claim_text_stemmed']
    df_data = pd.read_csv(filename, usecols=input_columns)
    df_data = df_data.drop_duplicates(subset=['appl_doc_number'], keep=False)
    df_data = df_data.dropna(subset=['claim_text_stemmed'], how='all')

    # tokenize() splits on single spaces, so empty tokens and bare
    # parenthesis tokens are added to sklearn's English stop list.
    my_words = ['','()','(),']
    my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

    corpus_vectorizer = TfidfVectorizer(norm='l2', max_df=0.95, min_df=2, use_idf=True, smooth_idf=False, sublinear_tf=True,
                                    stop_words=set(my_stop_words),
                                    tokenizer=tokenize)

    # Only the fitted vectorizer is needed here, so fit() replaces
    # fit_transform(), whose returned matrix was never used.
    corpus_vectorizer.fit(df_data['claim_text_stemmed'])

    tfidf_filename = '/home/ubuntu/data/clusterdata/tfidf_model_' + str(p) + '_' +str(c) + '.sav'
    # The context manager closes the file handle that used to be leaked.
    with open(tfidf_filename, 'wb') as model_file:
        pickle.dump(corpus_vectorizer, model_file)

In [53]:
# Fit and pickle one TF-IDF model for every second-level cluster file
# (p = first-level cluster index, c = second-level cluster index).
for p in range(10):
    for c in range(10):
        filename = '/home/ubuntu/data/clusterdata/small_cluster_data_' + str(p) + '_' + str(c) + '.csv'
        create_tfidf_for_small_files(filename, p, c)

In [61]:
#copy all files to S3 ( this is only a backup with all intermediate files)
# !aws s3 cp /home/ubuntu/data/clusterdata/  s3://ncw210data/FullTextDataClusters/ --recursive

In [62]:
#copy all files to S3 (to be used by application and follow up models)
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/topics --recursive --exclude "*" --include "topics_*.csv"
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/tfidfmodels --recursive --exclude "*" --include "tfidf*.sav" 
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/nmfmodels --recursive --exclude "*" --include "nmf*.sav" 
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_0*.csv"
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_1*.csv"    
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_2*.csv"    
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_3*.csv"    
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_4*.csv"    
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_5*.csv"
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_6*.csv"    
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_7*.csv"    
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_8*.csv"    
# !aws s3 cp /home/ubuntu/data/clusterdata/ s3://ncw210data/NewClusterFiles/smalldatafiles --recursive --exclude "*" --include "small_cluster_data_9*.csv"    