## Classification of stock tickers 

In [15]:
import os
import requests
import io
import xlrd 
import pandas as pd
import random as rd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

# cd drive/'My Drive'/'Colab Notebooks'

In [19]:
# Import data from github;
url = "https://github.com/clairepaoli/NLP_stock_tickers/blob/main/data/stata_data.csv?raw=true" 

# Fail loudly on HTTP errors (404, rate limiting, ...) instead of handing an
# error page to the CSV parser; the timeout keeps a dead connection from
# hanging the notebook indefinitely.
response = requests.get(url, timeout=30)
response.raise_for_status()
download = response.content

# The payload is decoded as UTF-8 text and parsed as CSV from memory.
df = pd.read_csv(io.StringIO(download.decode('utf-8')))
print (df.head())

# Alternatively, save locally and import;
# df = pd.read_excel(".../data/stata_data.xlsx")

                             Name fund_ticker Percent Pronounceable deg_level  \
0           Fidelity® Contrafund®       FCNTX                22.70%        MA   
1  T. Rowe Price Blue Chip Growth       TRBCX                21.67%        MA   
2      T. Rowe Price Growth Stock       PRGFX                23.68%        MA   
3        Fidelity® Growth Company       FDGRX                24.14%        MA   
4    Vanguard Dividend Growth Inv       VDIGX                17.39%        MA   

  deg_subject  CFA   exp         ManagerName      category          size  \
0     Finance   No  32.0      William Danoff  Large Growth  1.353730e+11   
1     Finance  Yes  27.0     Larry J. Puglia  Large Growth  5.832290e+10   
2     Finance   No  18.0      Joseph B. Fath  Large Growth  5.612382e+10   
3     Finance   No  29.0     Steven S. Wymer  Large Growth  4.827930e+10   
4     Finance   No  22.0  Donald J. Kilbride   Large Blend  3.434094e+10   

   TotalRetRankCat3Ymoendm ms_rating     esg_rating  \
0

In [20]:
# Create necessary variables;
# Ticker of every fund in the dataset, in row order.
funds = df["fund_ticker"].tolist()
# Running count of holdings workbooks that were opened successfully
# (incremented by the holdings loop further down).
total = 0

In [16]:
# Pronounceable share;
def pct_pron(list):
    """Return the fraction of labels that equal ``"pronounceable"``.

    Parameters
    ----------
    list : sequence of str
        Predicted labels; anything supporting ``len()`` and iteration works.
        The parameter name shadows the builtin and is kept only so existing
        callers are unaffected.

    Returns
    -------
    float
        Share of entries equal to ``"pronounceable"``, between 0 and 1.
        An empty input yields ``0.0`` instead of raising ZeroDivisionError.
    """
    n_labels = len(list)
    if n_labels == 0:
        # Nothing to classify: report a zero share rather than dividing by zero.
        return 0.0
    count = sum(1 for label in list if label == "pronounceable")
    return count / n_labels

def scramble(s):
    """Return a new string holding the characters of ``s`` in random order.

    Every character of ``s`` is used exactly once, so the result is always a
    permutation of the input (and may happen to equal it).
    """
    shuffled_chars = rd.sample(s, k=len(s))
    return "".join(shuffled_chars)

In [1]:
import nltk

# Fetch only the corpus this notebook uses. Calling nltk.download() with no
# arguments opens the interactive downloader, which blocks an unattended
# "Restart & Run All".
nltk.download('words')


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [8]:
from nltk.corpus import words

# Full NLTK word list; it mixes lower-case and capitalised entries.
word_list = words.words()

# Peek at the first ten entries.
word_list[0:10]

['A',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron']

In [31]:
# Keep only the entries that are already entirely lower-case (this drops the
# capitalised ones), trimming any surrounding whitespace.
pron = list(map(str.strip, (w for w in word_list if w.lower() == w)))
pron[0:10]

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aardvark',
 'aardwolf',
 'aba',
 'abac',
 'abaca']

In [32]:
# Create set of pronounceable and non-pronounceable;

import random

# A dedicated, seeded generator makes the scrambled words (and therefore the
# train/test data and every metric below) identical on each fresh run. The
# seed matches the random_state used for the train/test split.
scramble_rng = random.Random(21)

# Each "unpronounceable" example is a random permutation of a dictionary word.
# NOTE(review): very short or repetitive words (e.g. 'a', 'aa', 'aba') can
# scramble to themselves, so the same string may appear under both labels.
unpron = [''.join(scramble_rng.sample(word, len(word))) for word in pron]

unpron[:10]

['a',
 'aa',
 'laa',
 'liaia',
 'ama',
 'rkaraadv',
 'orflawad',
 'aba',
 'abca',
 'aacab']

In [27]:
#pron = pd.DataFrame (pron, columns = ['pron'])
#unpron = pd.DataFrame (unpron, columns = ['unpron'])

#pron.head()

Unnamed: 0,pron
0,a
1,aa
2,aal
3,aalii
4,aam


We then use the **train_test_split** function to randomly split our data. The first argument is the feature data, the second the targets (labels). The `test_size` keyword argument specifies what proportion of the original data is used for the test set. Lastly, `random_state` sets a seed for the random number generator that splits the data into train and test sets. Using the same seed later lets you reproduce the exact split and your downstream results. `train_test_split` returns four arrays: the training data, the test data, the training labels, and the test labels.

We set the size of the test set to 30% of the data.

It is also best practice to perform the split so that it reflects the labels in your data. That is, you want the labels to be distributed in the train and test sets as they are in the original dataset. To achieve this, we use the keyword argument `stratify = y`, where `y` is the list or array containing the labels.

In [34]:
# Features: every dictionary word followed by every scrambled word.
X = [*pron, *unpron]
# Labels, in the same order as X.
y = ['pronounceable' for _ in pron] + ['unpronounceable' for _ in unpron]

In [35]:
# Divide data between train and test set;
# 30% of the samples go to the test set, the split is seeded, and it is
# stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
    stratify=y,
)


In [38]:
# Number of samples in the training split.
len(X_train)

296150

In [39]:
# Number of samples in the test split.
len(X_test)

126922

In order to use textual data for predictive modeling, the text must first be split into individual units such as words or characters – this process is called **tokenization**. These tokens then need to be encoded as integers, or floating-point values, for use as inputs in machine learning algorithms. This process is called feature extraction (or vectorization).

In [40]:
# Example of using CountVectorizer;

# A one-document corpus to illustrate the encoding.
text = ["John is a good boy. John watches basketball"]

# Tokenize the corpus and learn its vocabulary (fit returns the vectorizer).
vectorizer = CountVectorizer().fit(text)

# Mapping from each token to its column index.
print(vectorizer.vocabulary_)

# Encode the document as a row of token counts.
vector = vectorizer.transform(text)

# Show the encoded matrix: its shape, then the dense counts.
print(vector.shape)
print(vector.toarray())

{'john': 4, 'is': 3, 'good': 2, 'boy': 1, 'watches': 5, 'basketball': 0}
(1, 6)
[[1 1 1 1 2 1]]


We use the MultinomialNB classifier, a Naive Bayes classifier for multinomial models. The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification).

In [41]:
# Build the classifier: count character 1-, 2- and 3-grams of each word and
# feed those counts to a multinomial Naive Bayes model.
char_ngram_counts = CountVectorizer(analyzer='char', ngram_range=(1, 3))
naive_bayes = MultinomialNB()

classify = Pipeline(steps=[
    ('vect', char_ngram_counts),
    ('clf', naive_bayes),
])

In [42]:
# Train the pipeline on the training split (fit works in place and returns
# the same pipeline object) ...
classify.fit(X_train, y_train)
# ... then label every word in the held-out test split.
y_pred = classify.predict(X_test)

In [43]:
# Evaluate the test-set predictions, then print the summary.
accuracy = metrics.accuracy_score(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

print('Accuracy Score:', accuracy)
print('Classification Report:')
print(report)


Accuracy Score: 0.923015710436331
Classification Report:
                 precision    recall  f1-score   support

  pronounceable       0.91      0.93      0.92     63461
unpronounceable       0.93      0.91      0.92     63461

       accuracy                           0.92    126922
      macro avg       0.92      0.92      0.92    126922
   weighted avg       0.92      0.92      0.92    126922



In [None]:
# Read in each set of holdings data and apply algorithm to each of the funds' holdings;

# Directory holding one "<ticker>.xlsx" holdings workbook per fund.
HOLDINGS_DIR = "/Users/jake/Desktop/Fine 547 Holdings/"

# Opened here; the results-writing cell below writes to it and closes it.
output_file = open(".../result.csv", "w")
matching = []        # (ticker, pronounceable share) per processed fund
skipped_funds = []   # (ticker, reason) per fund whose workbook could not be read

for fund in funds:
    try:
        fund_df = pd.read_excel(HOLDINGS_DIR + fund + ".xlsx")
    except Exception as err:
        # Best effort: a missing or unreadable workbook skips this fund only.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
        # through, and the reason is recorded instead of silently dropped.
        skipped_funds.append((fund, repr(err)))
        continue
    total += 1

    # Tickers come from the second column, skipping the first five rows
    # (presumably sheet header rows -- confirm against the workbooks).
    # NOTE(review): blank cells would arrive here as NaN rather than str --
    # confirm the classifier input is always text.
    fund_ticks = list(fund_df["Unnamed: 1"][5:])
    est_pron = classify.predict(fund_ticks)
    share = pct_pron(est_pron)

    # Five leading entries pad the skipped rows so the new column has one
    # value per row of fund_df; the fifth carries the fund-level share.
    output = ['NaN', 'NaN', 'NaN', 'Pronouncable', share] + list(est_pron)

    fund_df['pronounceable'] = output

    matching.append((fund, share))

print(f"{len(matching)} funds classified, {len(skipped_funds)} skipped")

In [None]:
# Write one "<ticker>,<share>" line per processed fund to the output file.
# `output_file` is the handle opened in the holdings cell above; try/finally
# guarantees it is closed even if a write fails part-way through.
try:
    for (f, v) in matching:
        output_file.write(str(f) + "," + str(v) + "\n")
finally:
    output_file.close()