<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Vendor-Classifier" data-toc-modified-id="Vendor-Classifier-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Vendor Classifier</a></span><ul class="toc-item"><li><span><a href="#VC-functions-and-packages" data-toc-modified-id="VC-functions-and-packages-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>VC functions and packages</a></span></li><li><span><a href="#VC-Variables-declaration" data-toc-modified-id="VC-Variables-declaration-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>VC Variables declaration</a></span></li><li><span><a href="#VC-Main" data-toc-modified-id="VC-Main-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>VC Main</a></span></li></ul></li><li><span><a href="#Cosine-Similarity-for-Company-Names" data-toc-modified-id="Cosine-Similarity-for-Company-Names-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Cosine Similarity for Company Names</a></span><ul class="toc-item"><li><span><a href="#CS-functions-and-packages" data-toc-modified-id="CS-functions-and-packages-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>CS functions and packages</a></span></li><li><span><a href="#CS-Variable-Declaration" data-toc-modified-id="CS-Variable-Declaration-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>CS Variable Declaration</a></span></li><li><span><a href="#CS-Main" data-toc-modified-id="CS-Main-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>CS Main</a></span></li></ul></li></ul></div>

# Vendor Classifier

## VC functions and packages

In [None]:
"""
This is the vendor classifier, which assigns companies to service
categories based on the text of their service types"""

import sys
import pandas as pd
import numpy as np
from autocorrect import spell
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
import warnings
warnings.filterwarnings('ignore')


"""
    Function for reading input file
    input:
        filename (string): name of the file
    output:
        df (dataframe): input dataframe
"""
#Read input file
def read_txt(filename):
    """
    Read the file named by ``filename`` into a dataframe.

    The reader is chosen from the final extension (case-insensitive):
    ``.csv`` is read as a comma-separated file; ``.txt`` is decoded as
    UTF-16 and parsed as a pipe-delimited file, with malformed rows
    reported and skipped instead of aborting the read.

    input:
        filename (string): name (or path) of the file
    output:
        df (dataframe): input dataframe, or None (after printing a
            message) when the extension is neither .csv nor .txt
    """
    import io
    import os

    # splitext only looks at the last extension, so names such as
    # "data.v2.csv" or "./inputs/file.txt" are classified correctly.
    extension = os.path.splitext(filename)[1].lower()

    if extension == '.csv':
        return pd.read_csv(filename)

    if extension == '.txt':
        with io.open(filename, encoding='utf-16') as f:
            contents = f.read()
        try:
            # pandas >= 1.3 spelling of "warn about bad rows and skip them"
            return pd.read_csv(io.StringIO(contents), sep="|", on_bad_lines='warn')
        except TypeError:
            # older pandas only understands the legacy keyword
            return pd.read_csv(io.StringIO(contents), sep="|", error_bad_lines=False)

    print("file type non recognized")
    return None
        

"""
    Dataframe preprocess helper function
    input:
        df (dataframe): input dataframe
        col (string): column name that contains the original service types
    output:
        df_pp (dataframe): processed dataframe
"""
#Dataframe pre-processing
def preprocess(df, col):
    """
    Clean the text column ``col`` and return a new dataframe.

    Rows containing a NaN in any column are dropped. The text in ``col``
    is lower-cased, every run of characters that is neither a letter nor
    whitespace is replaced by a single space, and the words "services",
    "service" and "fees" are removed. The input dataframe is not modified.

    input:
        df (dataframe): input dataframe
        col (string): column name that contains the original service types
    output:
        df_pp (dataframe): processed dataframe; the previous row labels
            are kept in a new 'index' column added by reset_index()
    """
    df_pp = df.dropna().copy()

    text = df_pp[col].str.lower()
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default and would strip nothing here.
    text = text.str.replace(r'[^A-Za-z\s]+', ' ', regex=True)
    # "services" has to go before "service" so no stray "s" is left behind.
    for word in ('services', 'service', 'fees'):
        text = text.str.replace(word, '', regex=False)
    df_pp[col] = text

    return df_pp.reset_index()

"""
    Spell check function
    input:
        df (dataframe): input dataframe
        col (string): column name that contains the original service types
    output:
        df_ac (dataframe): spell checked dataframe
"""
#Optional spell-check pass
def spellcheck(df, col):
    """
    Return a copy of ``df`` in which every entry of column ``col`` has
    been passed through ``spell``. The input dataframe is left untouched.

    input:
        df (dataframe): input dataframe
        col (string): column name that contains the original service types
    output:
        corrected (dataframe): spell checked copy of the dataframe
    """
    corrected = df.copy()
    corrected[col] = [spell(entry) for entry in corrected[col].tolist()]
    return corrected

"""
    Classifier function: trains the model on one split and evaluates it on the other
    input:
        df (dataframe): input dataframe
        orig (string): column name that contains the original service types
        final (string): column name that contains the labeled service types
        tsize (float): test to train ratio
        rstate (integer): random state number
    output:
        all_labels (list): predicted labels
        Accuracy (float): accuracy of the model
        X_test.index (list): index of the predicted labels
"""
#Classifier pipeline
def model(df, orig, final, tsize, rstate):
    """
    Train a text classifier on one split of ``df`` and predict the other.

    The pipeline is CountVectorizer -> TfidfTransformer -> one-vs-rest
    logistic regression with balanced class weights. Labels are binarized
    with MultiLabelBinarizer, so a test row may come back with zero, one
    or several labels; they are joined with a single space.

    input:
        df (dataframe): input dataframe
        orig (string): column name that contains the original service types
        final (string): column name that contains the labeled service types
        tsize (float): passed to train_test_split as test_size
        rstate (integer): passed to train_test_split as random_state
    output:
        all_labels (list): predicted label string for each test row
        Accuracy (float): percentage of test rows whose predicted label
            string equals the true label exactly
        X_test.index (index): row labels of the test rows, in the same
            order as all_labels
    """
    mlb = MultiLabelBinarizer()
    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(linear_model.LogisticRegression(class_weight='balanced')))])
    X_train, X_test, Y_train, Y_test = train_test_split(
        df[orig], df[final], test_size=tsize, random_state=rstate)

    # MultiLabelBinarizer expects one iterable of labels per sample. Build a
    # fresh list instead of assigning lists element by element into the
    # Series returned by the split.
    Y_train_bin = mlb.fit_transform([[label] for label in Y_train])
    classifier.fit(X_train, Y_train_bin)

    predicted = classifier.predict(X_test)
    all_labels = [' '.join(labels) for labels in mlb.inverse_transform(predicted)]

    # A row with no predicted label becomes '' and therefore counts as a miss.
    c = (np.asarray(Y_test) == np.array(all_labels))
    Accuracy = sum(c)/len(Y_test)*100
    print('Accuracy = ' + str(round(Accuracy, 2)) + '%')  #>80% will be fine as the model is evaluated with another metric
    return all_labels, Accuracy, X_test.index

"""
    Helper function that writes output to csv
    input:
        df (dataframe): output data frame
        ind (list): index of the predicted labels
        pred (list): predicted labels
"""
#Output function
def output_df(df, ind, pred, outputfile='vc_output.csv'):
    """
    Attach the predictions to ``df`` and write the result to a csv file.

    A 'Predictions' column is added to ``df`` in place: rows whose label is
    in ``ind`` receive the matching entry of ``pred``, all other rows keep
    an empty string. The helper 'index' column is left out of the written
    file when it is present.

    input:
        df (dataframe): output data frame
        ind (list): index of the predicted labels
        pred (list): predicted labels, in the same order as ind
        outputfile (string): name of the csv file to write
    """
    df['Predictions'] = ''
    # One .loc call writes into df itself. The chained form
    # df['Predictions'][ind] = ... writes into a temporary and is not
    # applied under pandas Copy-on-Write.
    df.loc[ind, 'Predictions'] = pred
    trimmed = df.drop(['index'], axis=1, errors='ignore')
    trimmed.to_csv(outputfile, sep=',', index=False)

## VC Variables declaration

In [None]:
#Variable declaration: notebook-level settings read by the VC main cell
inputfile = 'vc_input.csv'    #name of the input file, passed to read_txt
outputfile = 'vc_output.csv'    #output file name; not referenced by the VC main cell
orig = 'OrigService'    #column holding the original service type text
final = 'FinalService'    #column holding the labeled service category
tsize = 0.5    #passed to train_test_split as test_size
rstate = 24    #passed to train_test_split as random_state
scheck = 'off'    #spell-check switch; not referenced by the VC main cell

## VC Main

In [None]:
"""Main function"""

#Main script: runs top to bottom and leaves df, pred, acc and ind as notebook globals
#Load the input data (inputfile is passed to read_txt)
df = read_txt(inputfile)
#Drop incomplete rows and normalise the text in the orig column
df = preprocess(df,orig)
#Train on one split and predict the other; ind holds the row labels of the predicted rows
pred, acc, ind = model(df, orig, final, tsize, rstate)
#Attach the predictions to df and write the result to csv
output_df(df, ind, pred)

# Cosine Similarity for Company Names

## CS functions and packages

In [1]:
"""
This is the algorithm that groups companies under their most commonly
mentioned name using cosine similarity"""

import sys
import pandas as pd
import numpy as np
import operator
import re, math
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

"""
    Function for reading input file
    input:
        filename (string): name of the file
    output:
        df (dataframe): input dataframe
"""
#Read input file
def read_txt(filename):
    """
    Read the file named by ``filename`` into a dataframe.

    The reader is chosen from the final extension (case-insensitive):
    ``.csv`` is read as a comma-separated file; ``.txt`` is decoded as
    UTF-16 and parsed as a pipe-delimited file, with malformed rows
    reported and skipped instead of aborting the read.

    input:
        filename (string): name (or path) of the file
    output:
        df (dataframe): input dataframe, or None (after printing a
            message) when the extension is neither .csv nor .txt
    """
    import io
    import os

    # splitext only looks at the last extension, so names such as
    # "data.v2.csv" or "./inputs/file.txt" are classified correctly.
    extension = os.path.splitext(filename)[1].lower()

    if extension == '.csv':
        return pd.read_csv(filename)

    if extension == '.txt':
        with io.open(filename, encoding='utf-16') as f:
            contents = f.read()
        try:
            # pandas >= 1.3 spelling of "warn about bad rows and skip them"
            return pd.read_csv(io.StringIO(contents), sep="|", on_bad_lines='warn')
        except TypeError:
            # older pandas only understands the legacy keyword
            return pd.read_csv(io.StringIO(contents), sep="|", error_bad_lines=False)

    print("file type non recognized")
    return None
        
"""
    Helper function for removing null entries
    input:
        df (dataframe): dataframe containing the company names
        final_serv (string): column name from input that lists the service categories
    output:
        df_nona (dataframe): dataframe with no null entries
"""
#Drop rows without a service category
def remove_null(df, final_serv):
    """
    Keep only the rows of ``df`` whose ``final_serv`` entry is not null.

    input:
        df (dataframe): dataframe containing the company names
        final_serv (string): column name that lists the service categories
    output:
        dataframe restricted to rows with a service category; the
        original row labels are preserved
    """
    has_service = ~df[final_serv].isnull()
    return df[has_service]

"""
    Helper function for grouping company names according to their service categories
    input:
        dfg (groupby object): company dataframe grouped by service category
        g (integer): position of the service category in the list of group names
        col (string): column name that contains the original company names
    output:
        df_type (dataframe): dataframe for each service category
"""
#Separate the companies according to their service categories
def create_df_for_each_service_type(dfg, g, col):
    """
    Build the working dataframe for one service category.

    The category is the g-th key of ``dfg.groups``. Its rows are sorted by
    ``col``, the names in ``col`` are upper-cased, and a 'Grouped Names'
    column is added that starts out as a copy of ``col``.

    input:
        dfg (groupby object): company dataframe grouped by service category
        g (integer): position of the category among the group keys
        col (string): column name that contains the original company names
    output:
        df_type (dataframe): dataframe for the selected service category;
            the previous row labels are kept in an 'index' column
    """
    # Take the key from the groupby object itself rather than from a
    # notebook-level global, so the function works wherever it is called.
    group_key = list(dfg.groups.keys())[g]
    df_type = dfg.get_group(group_key).sort_values(col).reset_index()
    df_type[col] = df_type[col].str.upper()
    df_type['Grouped Names'] = df_type[col]
    return df_type

"""
    Helper function for creating a list of company names from the input dataframe
    input:
        df (dataframe): dataframe containing the company names
        col (string): column name that contains the original company names
    output:
        companylist (list): list containing company names in each category
"""
#Create the list of word vectors for the company names
def create_company_vectors(df, col):
    """
    Turn every entry of column ``col`` into a word vector.

    input:
        df (dataframe): dataframe containing the company names
        col (string): column name that contains the original company names
    output:
        list with one text_to_vector() result per row, in row order
    """
    return [text_to_vector(name) for name in np.asarray(df[col])]

"""
    Function that compares company names with cosine similarity and replaces names with the most commonly mentioned name
    input:
        df (dataframe): dataframe containing the company names
        col (string): column name that contains the original company names
        cos_similarity (matrix): cosine similiarity matrix
        cos_damp (float): cosine similarity damping coefficient
    output:
        df (dataframe): dataframe with companies name replaced
"""
#Compare names with cosine similarity
def compare_company_names(df, col, cos_similarity, cos_damp):
    """
    Replace similar company names with the most frequent variant.

    Rows are visited in order. For each row j that was not already matched
    by an earlier row, every row i with cos_similarity[j][i] < -cos_damp
    is treated as similar (the matrix holds negated similarities). When
    more than one row is similar, those whose name starts with the same
    character as name j are given, in 'Grouped Names', the name that
    occurs most often among them (ties go to the first one encountered).
    Every similar row, whatever its first character, is then skipped as a
    starting point.

    input:
        df (dataframe): dataframe containing the company names; it must
            already have a 'Grouped Names' column and is modified in place
        col (string): column name that contains the original company names
        cos_similarity (matrix): negated cosine similarity matrix
        cos_damp (float): cosine similarity threshold
    output:
        df (dataframe): the same dataframe with 'Grouped Names' updated
    """
    names = np.asarray(df[col])
    # Positions in `names` are row positions, so write through iloc. The
    # chained form df['Grouped Names'][m] = ... is not applied under pandas
    # Copy-on-Write.
    grouped_col = df.columns.get_loc('Grouped Names')
    handled = set()

    for j in range(len(names)):
        if j in handled:
            continue

        # Build the mask once per row instead of once per (j, i) pair.
        similar = np.flatnonzero(np.asarray(cos_similarity[j]) < -cos_damp).tolist()

        if len(similar) > 1:
            first_char = names[j][0]
            members = [i for i in similar if names[i][0] == first_char]
            if members:
                member_names = [names[i] for i in members]
                counts = Counter(member_names)
                # max() over the list (not the Counter) keeps the original
                # tie-break: the first name reaching the highest count wins.
                canonical = max(member_names, key=counts.__getitem__)
                df.iloc[members, grouped_col] = canonical

        handled.update(similar)

    return df

"""
    Helper function that calculates cosine angle
    input:
        vec1 (vector): vector 1
        vec2 (vector): vector 2
    output:
        cosine angle
"""
#Cosine of the angle between two word-count vectors
def get_cosine(vec1, vec2):
    """
    Compute the cosine similarity of two word-count mappings.

    input:
        vec1 (vector): mapping from word to count
        vec2 (vector): mapping from word to count
    output:
        dot product over the shared words divided by the product of the
        two Euclidean norms, or 0.0 when that product is zero
    """
    shared = set(vec1.keys()) & set(vec2.keys())
    dot = sum(vec1[word] * vec2[word] for word in shared)

    norm1 = math.sqrt(sum(vec1[word] ** 2 for word in vec1.keys()))
    norm2 = math.sqrt(sum(vec2[word] ** 2 for word in vec2.keys()))
    norm_product = norm1 * norm2

    if norm_product:
        return float(dot) / norm_product
    return 0.0
    
"""
    Helper function that transform text to vector
    input:
        text (string): text string
    output:
        word vector
"""
#Turn a text string into a word-count vector
def text_to_vector(text):
    """
    Count the words in ``text``.

    input:
        text (string): text string
    output:
        Counter mapping each run of word characters (regex \\w+) to the
        number of times it occurs
    """
    return Counter(re.findall(r'\w+', text))

"""
    Helper function that writes output to csv
    input:
        df_print (dataframe): output data frame
        gp_names (list): service category names; entry i names the output file
        i (integer): iteration number
"""
#Write one service category to its own csv file
def print_output(df_print, gp_names, i):
    """
    Save ``df_print`` as '<category>.csv' in the working directory.

    The category is gp_names[i]; any '/' in it is replaced by '_' before
    it is used as a file name. The row index is not written.

    input:
        df_print (dataframe): output data frame
        gp_names (list): service category names
        i (integer): position of the category in gp_names
    """
    safe_name = gp_names[i].replace('/', '_')
    df_print.to_csv(safe_name + '.csv', sep=',', index=False)

## CS Variable Declaration

In [None]:
#Variable declaration: notebook-level settings read by the CS main cell
inputfile  = 'cs_input.csv'       #Name of the input file, passed to read_txt
final_serv = 'FinalService'       #Column holding the final service type of each company (used for grouping)
orig_name  = 'origName'           #Column holding the original company names
cos_damp   = 0.5                  #Cosine similarity threshold: names are treated as similar when their similarity is strictly greater than this

## CS Main

In [None]:
"""Main function"""

#Read input (inputfile is passed to read_txt)
df = read_txt(inputfile)
#Remove entries that have no service type
df_nona = remove_null(df, final_serv)
#Create groupby object by grouping by service types
dfg = df_nona.groupby(final_serv)
#List of service type names; position g in this list identifies the group handled in iteration g
gp_names = list(dfg.groups.keys())

#For loop to compare company names for each service type
for g in range(len(gp_names)):
    #Create a dataframe with data from each service type
    df_type = create_df_for_each_service_type(dfg, g, orig_name)
    #Create company name vectors for each entry in the dataframe
    company_vec = create_company_vectors(df_type, orig_name)
    #Calculate the cosine similarity matrix; it is negated here, and compare_company_names tests it against -cos_damp
    cos_similarity = -1*np.array([[get_cosine(w1,w2) for w1 in company_vec] for w2 in company_vec])
    #Compare company names using the cosine similarity matrix and write the unified name of each company into the dataframe
    df_print = compare_company_names(df_type, orig_name, cos_similarity, cos_damp)
    #Print dataframe containing unified company names to csv for each service type
    print_output(df_print, gp_names, g)