# Data Preparation

Takes in a .txt of the course names, outputs unigram, bigram, and trigram probabilities

In [4]:
import re
import numpy as np
import nltk

courses = []
with open('/Users/ericknudson/Desktop/Data Science Projects/SOM Course Name Generator/coursenames16.txt', encoding='utf-16') as fp:
    courses = fp.readlines()
courses = [x.strip('\"') for x in courses] 
mgt = re.compile("MGT*")
mgtcourses = []
for course in courses:
    if mgt.match(course):
        mgtcourses.append(course)

In [5]:
mgtcourses = [x.split() for x in mgtcourses]
nums = []
for i, course in enumerate(mgtcourses):
    nums.append(course[1])
    mgtcourses[i] = course[2:]
nums.remove("Successful")

In [6]:
mgtcourses = [" ".join(x) for x in mgtcourses]
uniquecourses = set(mgtcourses)
uniquecourses = list(uniquecourses)
uniquecourses.remove("(01) MGT 857: Digital Strategy")
uniquecourses.append("Digital Strategy")
uniquecourses.remove("MGT 697: Capitalism and its Critics")
uniquecourses.append("Capitalism and its Critics")
mgt = re.compile("GNAM*")
for i, course in enumerate(uniquecourses):
    if mgt.match(course):
#        print(course)
        course = course.split(":")
        course = course[1].strip('\"')
        uniquecourses[i] = course
uniquecourses = [x.strip(" ") for x in uniquecourses]

In [7]:
hbscourses = []
with open('/Users/ericknudson/Desktop/Data Science Projects/SOM Course Name Generator/HBScourses.txt') as fp:
    hbscourses = fp.readlines()
hbscourses = set(hbscourses)
hbscourses = list(hbscourses)

In [8]:
courselist = hbscourses + uniquecourses
for i, course in enumerate(courselist):
    course = course.replace(u'\xa0', u' ')
    course = course.replace(u'Application Only', u'')
    course = course.replace(u'America’s', u'America\'s')
    course = re.sub(r' and ', ' & ', course)
    courselist[i] = course

In [9]:
tokenizedcourses = [x.title() for x in courselist]
tokenizedcourses = [re.findall(r"[\w'-]+|[,!?:;&]", x) for x in tokenizedcourses]

In [11]:
import csv
w = csv.writer(open("tokenizedcourses.csv","w"))
for i in tokenizedcourses:
    w.writerow(i)

In [12]:
tokenizedcourses2 = []
with open('tokenizedcourses.csv') as csv_file:
    csv_reader = csv.reader(x.replace('\0', '') for x in csv_file)
    for row in csv_reader:
        tokenizedcourses2.append(row)

# Generate n-gram probability dictionaries

In [13]:
import math
from collections import Counter

START_SYMBOL = '*'
STOP_SYMBOL = 'STOP'

def calc_probabilities(training_corpus):
    unigram_counts = Counter()
    bigram_counts = Counter()
    trigram_counts = Counter()
    for course in training_corpus:

        #tokens = course.split()
        course = course.title()
        tokens = re.findall(r"[\w'-]+|[,!?:;&]", course)

        unigram_tokens = tokens + [STOP_SYMBOL]
        unigram_counts.update(unigram_tokens)

        bigram_tokens =  [START_SYMBOL] + tokens + [STOP_SYMBOL]
        bigram_tuples = list(nltk.bigrams(bigram_tokens))
        bigram_counts.update(bigram_tuples)

        trigram_tokens =  [START_SYMBOL] + [START_SYMBOL] + tokens + [STOP_SYMBOL]
        trigram_tuples = list(nltk.trigrams(trigram_tokens))
        trigram_counts.update(trigram_tuples)

    n_unigrams = sum(unigram_counts.values())
    n_bigrams = sum(bigram_counts.values())
    n_trigrams = sum(trigram_counts.values())

    unigram_counts = dict(unigram_counts)
    bigram_counts = dict(bigram_counts)
    trigram_counts = dict(trigram_counts)

    #unigram_probabilities = {k:math.log(x / n_unigrams,2) for (k,x) in unigram_counts.items()}
    unigram_probabilities = {}
    for (unigram, count) in unigram_counts.items():
        #prob = math.log(count,2) - math.log(n_unigrams,2)
        prob = count / n_unigrams
        unigram_probabilities.update({unigram: prob})

    bigram_probabilities = {}
    for (bigram, count) in bigram_counts.items():
        if bigram[0] == START_SYMBOL:
            #prob = math.log(count,2) - math.log(unigram_counts[STOP_SYMBOL],2) #set denom to number of sentences
            prob = count / unigram_counts[STOP_SYMBOL]
            bigram_probabilities.update({bigram: prob})
        else:
            #prob = math.log(count,2) - math.log(unigram_counts[bigram[0]],2)
            prob = count / n_bigrams
            bigram_probabilities.update({bigram: prob})

    trigram_probabilities = {}
    for (trigram, count) in trigram_counts.items():
        if trigram[0:2] == (START_SYMBOL,START_SYMBOL):
            #prob = math.log(count,2) - math.log(unigram_counts[STOP_SYMBOL],2) #set denom to number of sentences
            prob = prob = count / unigram_counts[STOP_SYMBOL]
            trigram_probabilities.update({trigram: prob})
        else:
            #prob = math.log(count,2) - math.log(bigram_counts[trigram[0:2]],2)
            prob = count / n_trigrams
            trigram_probabilities.update({trigram: prob})
    return unigram_probabilities, bigram_probabilities, trigram_probabilities

In [14]:
unigram_probabilities, bigram_probabilities, trigram_probabilities = calc_probabilities(courselist)

# N-Gram Name Generators

### unigram model

In [246]:
from numpy.random import choice
output = []
next_token = ""
while next_token != 'STOP':
    next_token = choice(np.array(list(unigram_probabilities.keys())), 1, p = np.array(list(unigram_probabilities.values())))[0]
    output.append(next_token)
output = output[0:-1] #remove stop
new_name = " ".join(output)
num = choice(nums,1)[0]
new_name = "MGT " + num + " " + new_name 
new_name

'MGT 955-01 For New Evolution Fund Strategic'

### bigram model

In [19]:
from numpy.random import choice
def gen_bigram_course(*args):
    bigram_probabilities = args[0]
    tokenized_courses = args[1]
    output = ["*"]
    next_token = ""
    i = 1
    while next_token != 'STOP':
        current_history = output[i-1]
        possible_bigrams = [(key, value) for key, value in bigram_probabilities.items() if key[0] == current_history]
        bigrams = [x[0] for x in possible_bigrams]
        bigrams = [x[1] for x in bigrams]
        probs = [x[1] for x in possible_bigrams]
        sprobs = np.sum(probs)
        probs = np.divide(probs,sprobs)
        if i == 1:
            next_token = choice(np.array(bigrams), 1)[0]
        else:
            #next_token = choice(np.array(bigrams), 1)[0]
            next_token = choice(np.array(bigrams), 1, p = np.array(probs))[0]
        #look back 2, choose bigram based on distribution of bigrams with those first two words in common
        output.append(next_token)
        i = i + 1
    output = output[1:-1] #remove stop
    if output in tokenized_courses:
        print("dupe:", output)
        output = gen_bigram_course(bigram_probabilities, tokenized_courses)
    return(output)
        
def put_course_name_together(output):
    new_name = " ".join(output)
    new_name = new_name.replace(" ,",",")
    new_name = new_name.replace(" :",":")
    new_name = new_name.replace(" ?","?")
    num = choice(nums,1)[0]
    new_name = "MGT " + num + " " + new_name 
    return(new_name)

In [24]:
output = gen_bigram_course(bigram_probabilities, tokenizedcourses)
course = put_course_name_together(output)
print(course)

MGT 612-01 Building & Literature


In [None]:
courselis

### trigram model

In [76]:
def gen_trigram_course(*args):
    trigram_probabilities = args[0]
    tokenized_courses = args[1]
    output = ["*", "*"]
    next_token = ""
    i = 2
    while next_token != 'STOP':
        current_history = tuple(output[i-2:i])
        possible_trigrams = [(key, value) for key, value in trigram_probabilities.items() if key[0:2] == current_history]
        trigrams = [x[0][2] for x in possible_trigrams]
        probs = [x[1] for x in possible_trigrams]
        sprobs = np.sum(probs)
        probs = np.divide(probs,sprobs)
        if i == 2:
            next_token = choice(np.array(trigrams), 1)[0]
        else:
            #next_token = choice(np.array(trigrams), 1, p = np.array(probs))[0]
            next_token = choice(np.array(trigrams), 1)[0]
        output.append(next_token)
        i = i + 1
    output = output[2:-1] #remove stop
    if output in tokenized_courses:
        print("dupe:", output)
        output = gen_course(bigram_probabilities, tokenized_courses)
    return(output)

In [304]:
output = gen_trigram_course(trigram_probabilities, tokenizedcourses)
course = put_course_name_together(output)
print(course)

dupe: ['Product', 'Management', '101']
dupe: ['Investment', 'Management']
dupe: ['Interpersonal', 'Dynamics']
MGT 873-01 Inclusive Economic Development


# Generate Databases for Web App

bigrams and nums to files

In [213]:
import csv

w = csv.writer(open("bigram_probabilities.csv","w"))
for (key, val) in bigram_probabilities.items():
    w.writerow([key[0], key[1], val])
w = csv.writer(open("nums.csv","w"))
for i in nums:
    w.writerow(i)

In [211]:
bigram_probabilities[("&","Society")] = bigram_probabilities[("&","Society")] * 2

In [212]:
bigram_probabilities[("&","Society")]

0.007782101167315175