In [3]:
import xlrd 
import pandas as pd
import random as rd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

#Make set of labeled data using dictionary words and random scrambles of them
def scramble(s):
    """Return the characters of ``s`` rearranged in a random order.

    ``len(s)`` characters are drawn from ``s`` without replacement using
    ``rd.sample`` and joined back into one string, so the result is a
    random permutation of the input (and may happen to equal the input).
    """
    n_chars = len(s)
    shuffled_chars = rd.sample(s, n_chars)
    return "".join(shuffled_chars)

#Keep only the word-list entries that contain no uppercase characters; the
#comparison runs on the raw line, and strip() removes the trailing newline.
#The with-block closes the word list as soon as it has been read.
with open('/usr/share/dict/words') as words_file:
    pron = [word.strip() for word in words_file if word == word.lower()]
#One random scramble per word serves as its "unpronounceable" counterpart
unpron = [scramble(word) for word in pron]

X = pron+unpron
y = ['pronounceable']*len(pron) + ['unpronounceable']*len(unpron)

X_train, X_test, y_train, y_test = train_test_split(X, y)

#Build classification algorithm using character 1- to 3-grams (single
#characters, bigrams and trigrams) of the words in the dictionary
classify = Pipeline([
    ('vect', CountVectorizer(analyzer='char', ngram_range=(1, 3))),
    ('clf', MultinomialNB())
    ])

#Fit and test the algorithm
classify = classify.fit(X_train, y_train)
est_class = classify.predict(X_test)

#Print test results
print('Accuracy Score:', metrics.accuracy_score(y_test, est_class))
print('Classification Report:')
print(metrics.classification_report(y_test, est_class))



#Import data file
#Loads the workbook into a DataFrame; its "fund_ticker" column is read
#further down to enumerate the funds to process.
df1=pd.read_excel("/Users/jake/Desktop/Fine 547 Stata.xlsx")

#Make percent pronounceable function for later
def pct_pron(list):
    """Return the fraction of labels in ``list`` equal to "pronounceable".

    ``list`` is any sized iterable of labels, e.g. a Python list or an
    array of predicted labels. The parameter name shadows the builtin; it
    is kept only so that existing callers are unaffected.

    Returns a float between 0 and 1, or NaN when ``list`` is empty, because
    the fraction is undefined there (this used to raise ZeroDivisionError).
    """
    n_labels = len(list)
    #Use len() rather than truthiness: NumPy arrays reject ``if not arr``.
    if n_labels == 0:
        return float("nan")
    count = sum(1 for label in list if label == "pronounceable")
    return count / n_labels

#Create necessary variables and arrays
funds = list(df1["fund_ticker"])
total = 0  #number of funds whose holdings workbook was read successfully
matching = []  #(fund, fraction predicted pronounceable) pairs

#Read in each set of holdings data and apply the algorithm to each of the funds' holdings
for fund in funds:
    try:
        fund_df = pd.read_excel("/Users/jake/Desktop/Fine 547 Holdings/" + fund + ".xlsx")
    except Exception:
        #Best effort: skip any fund whose workbook cannot be loaded (missing
        #or unreadable file, or a non-string ticker breaking the path
        #concatenation). Unlike a bare except, this still lets
        #KeyboardInterrupt and SystemExit propagate.
        continue
    total += 1

    #Column "Unnamed: 1" with the first five rows skipped; presumably the
    #tickers of the fund's holdings - confirm against the workbook layout
    fund_ticks = list(fund_df["Unnamed: 1"][5:])
    est_pron = classify.predict(fund_ticks)
    fund_pct = pct_pron(est_pron)

    #Five leading cells stand in for the five skipped rows so the new column
    #has one entry per row of fund_df
    output = ['NaN', 'NaN', 'NaN', 'Pronouncable', fund_pct] + list(est_pron)

    fund_df['prounouncable'] = output

    matching.append((fund, fund_pct))

#Write results into the output file, one "fund,fraction" line per fund.
#The file is opened only now, inside a with-block, so it is always closed
#and an earlier failure does not leave a truncated result.csv behind.
with open("/Users/jake/Desktop/Fine 547 Holdings/result.csv", "w") as output_file:
    for (f, v) in matching:
        output_file.write(str(f) + "," + str(v) + "\n")

Accuracy Score: 0.9237260783718104
Classification Report:
                 precision    recall  f1-score   support

  pronounceable       0.92      0.93      0.92     52596
unpronounceable       0.93      0.91      0.92     52748

    avg / total       0.92      0.92      0.92    105344

