# Correlating Language to Geographic Location

Having parsed a significant number of geographically linked phrases, we would now like a way to predict the geographic origin of a speaker or writer from their language. Since no dataset can possibly cover the full extent of U.S. dialects, we use machine learning techniques to predict geographic origin even when the raw data is not entirely conclusive.

In order to accomplish this, we will first vectorize words and sentences, then utilize a Naive Bayes classifier (scikit-learn's implementation).

Please refer to and run `geodare.json`, which converts the raw DARE corpus into a more usable format, prior to using this notebook.

In [8]:
import os

import nltk
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import Bunch

## For DARE corpus only

In [39]:
# Load the cleaned DARE corpus into a DataFrame.
# The later cells read its 'word' and 'dialect' columns.
dare_corpus_path = "../data/cleaned_dare_corpus.csv"
geodata = pd.read_csv(dare_corpus_path)

In [40]:
'''Create classification categories, i.e. target names
'''
# Collect each distinct dialect once, in order of first appearance.
# dict.fromkeys preserves insertion order and de-duplicates in a single pass,
# replacing the list-membership test that rescanned `catagories` for every row.
catagories = list(dict.fromkeys(geodata['dialect']))
print(catagories)

['alabama', 'alaska', 'algonquian', 'arizona', 'arkansas', 'bahamian', 'california', 'caribbean', 'cherokee', 'choctaw', 'colorado', 'connecticut', 'delaware', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'illinois, chicago', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'louisiana, new orleans', 'maine', 'maryland', 'maryland, baltimore', 'massachusetts', 'massachusetts, boston', 'massachusetts, cape cod', 'massachusetts, nantucket', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'narraganset', 'nebraska', 'nevada', 'new hampshire', 'new jersey', 'new mexico', 'new york', 'new york city', 'new york, hudson valley', 'new york, long island', 'new york, upstate', 'newfoundland', 'north carolina', 'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode island', 'south carolina', 'south carolina, charleston', 'south dakota', 'tennessee', 'texas', 'utah', 'vermont', 'virginia', 'washington', 'washington, dc', 'west virginia', 'wisconsin', 'wyomin

In [41]:
'''Create data point and target lists
'''
# Position of every category name in `catagories`, computed once so each row
# is a dictionary lookup instead of a linear list.index() scan.
category_position = {name: position for position, name in enumerate(catagories)}

examples = []
targets = []

# Walk the two columns in lockstep. DataFrame.get_value was removed in
# pandas 1.0, and pairing the columns positionally also avoids using the
# enumerate counter as an index label.
for word, dialect in zip(geodata['word'], geodata['dialect']):
    # Rows without a word carry no training signal; skip them.
    if pd.notnull(word):
        examples.append(word)
        targets.append(category_position[dialect])


In [42]:
'''Sanity check the data points against their targets, then bunch them together.
'''
# Every example must have exactly one target; the loop that builds them appends
# to both lists together, so a mismatch means the lists were modified elsewhere.
assert len(examples) == len(targets), (
    "data/target mismatch: %d examples vs %d targets" % (len(examples), len(targets))
)

# Bunch comes from sklearn.utils (imported in the notebook's import cell);
# the private sklearn.datasets.base module was removed from scikit-learn.
training = Bunch(target=targets, data=examples, target_names=catagories)

# Spot-check the last example (index 36954 for the 36955-row corpus) without
# hard-coding the corpus size.
print(training.target_names[training.target[-1]])

south carolina


## For SB translations only

In [48]:
''' !!! IMP: TEMPORARY. IMPORT METHOD FROM lexicon.ipynb ONCE CONVERTED INTO PYTHON SCRIPT

Converts a string listing state abbreviations to a string listing state names.
ex. "ak, al, ar" --> "alaska, alabama, arkansas"

''' 

def abbrev_to_state(abbreviations):
    '''Expand a list of postal abbreviations into lower-case full names.

    Parameters:
        abbreviations: a string of abbreviations separated by ", "
            (comma followed by one space). Case is ignored.

    Returns:
        A string with the corresponding full names, lower-cased and joined
        by ", ", in the same order as the input.

    Raises:
        KeyError: if any item (after upper-casing) is not in the lookup table.
            This includes the empty string and items that were not separated
            by exactly ", ".

    Ex. The call abbrev_to_state("ak, al, ar") will return the string:

    "alaska, alabama, arkansas"

    '''
    # Lookup table: upper-case abbreviation -> display name.
    names_by_code = {
        'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AS': 'American Samoa',
        'AZ': 'Arizona', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut',
        'DC': 'District of Columbia', 'DE': 'Delaware', 'FL': 'Florida',
        'GA': 'Georgia', 'GU': 'Guam', 'HI': 'Hawaii', 'IA': 'Iowa', 'ID': 'Idaho',
        'IL': 'Illinois', 'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky',
        'LA': 'Louisiana', 'MA': 'Massachusetts', 'MD': 'Maryland', 'ME': 'Maine',
        'MI': 'Michigan', 'MN': 'Minnesota', 'MO': 'Missouri',
        'MP': 'Northern Mariana Islands', 'MS': 'Mississippi', 'MT': 'Montana',
        'NA': 'National', 'NC': 'North Carolina', 'ND': 'North Dakota',
        'NE': 'Nebraska', 'NH': 'New Hampshire', 'NJ': 'New Jersey',
        'NM': 'New Mexico', 'NV': 'Nevada', 'NY': 'New York', 'OH': 'Ohio',
        'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'PR': 'Puerto Rico',
        'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota',
        'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VA': 'Virginia',
        'VI': 'Virgin Islands', 'VT': 'Vermont', 'WA': 'Washington',
        'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming',
    }
    codes = abbreviations.split(", ")
    # Unknown codes deliberately propagate KeyError to the caller.
    return ", ".join(names_by_code[code.upper()].lower() for code in codes)

In [54]:
'''Load all transcription .csv data into one DataFrame
'''

directory = "../data/SBData/dialectsdata"
transcript_columns = ["name", "dialect", "word"]

# Read every "dialects*.csv" file first and concatenate once at the end:
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration. The files have no header row, so column names are supplied here.
# NOTE: os.listdir returns entries in arbitrary, filesystem-dependent order,
# so row positions in `transcripts` depend on that order.
transcript_frames = [
    pd.read_csv(os.path.join(directory, filename), names=transcript_columns)
    for filename in os.listdir(directory)
    if filename.startswith("dialects") and filename.endswith(".csv")
]

if transcript_frames:
    # ignore_index=True gives a fresh 0..n-1 index, replacing the previous
    # reset_index() + drop("index") pair.
    transcripts = pd.concat(transcript_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the
    # expected columns so later cells still find them.
    transcripts = pd.DataFrame(columns=transcript_columns)

# Show a preview rather than the whole frame.
transcripts.head(10)

Unnamed: 0,dialect,name,word
0,CA,LENORE,So you don't need to go borrow equipment from ...
1,MT,LYNNE,H YWN Well we're gonna have to find somewhere ...
2,MT,DORIS,So Mae
3,MT,LYNNE,I'm gonna Hx
4,MT,DORIS,Mae Lynne XX
5,MT,LYNNE,H We're not gonna do the feet today I'm gonna ...
6,CA,LENORE,Did they train you
7,MT,LYNNE,yeah
8,CA,LENORE,Did they train you that XX
9,MT,LYNNE,yeah yeah


In [77]:
'''Clean DataFrame
'''
SBexamples = []
SBtargets = []

# Walk the two columns in lockstep (DataFrame.get_value was removed in pandas 1.0).
for word, dialect in zip(transcripts['word'], transcripts['dialect']):
    # Skip rows missing either the utterance or the speaker's dialect.
    if pd.isnull(word) or pd.isnull(dialect):
        continue
    try:
        target = catagories.index(abbrev_to_state(dialect))
    except (KeyError, ValueError):
        # KeyError: abbrev_to_state met an abbreviation it does not know.
        # ValueError: the expanded name is not one of `catagories`, e.g. a
        # speaker listed with several states ("california, montana").
        # These speakers do not identify with a single known category; discard.
        continue
    # Append only after the target resolved, so both lists stay aligned.
    SBexamples.append(word)
    SBtargets.append(target)

# Sanity check: every kept example has exactly one target.
assert len(SBexamples) == len(SBtargets)

california wisconsin


In [None]:
'''Bunch SBexamples to their targets and targetnames'''
# Bunch comes from sklearn.utils (imported in the notebook's import cell);
# the private sklearn.datasets.base module was removed from scikit-learn.
SBtraining = Bunch(target=SBtargets, data=SBexamples, target_names=catagories)

# should print 'california wisconsin'
# NOTE(review): index 21294 is tied to one particular load of the transcripts;
# it raises IndexError if fewer examples were kept.
print(SBtraining.target_names[SBtraining.target[0]], SBtraining.target_names[SBtraining.target[21294]])

## Back to classification (both)

In [6]:
# Wrap the DARE training examples in a single-column DataFrame so they can be
# cast and flattened for the vectorizer. The previous stand-alone
# `df.astype('U').values.ravel()` statement was removed: its result was
# discarded, so it had no effect.
df = pd.DataFrame(training.data)
df.head()

Unnamed: 0,0
0,gooselock
1,swale
2,tickbird
3,twistification
4,ahkio


In [7]:
'''Vectorize text data by the number of occurrences (count)
'''
# CountVectorizer is imported in the notebook's import cell.
count_vect = CountVectorizer()
# Cast every entry to a unicode string and flatten the single column to 1-D
# before handing the documents to the vectorizer.
X_train_counts = count_vect.fit_transform(df.astype('U').values.ravel())
X_train_counts.shape


(36955, 2951)

In [8]:
'''Normalize vectorized data with Term Frequency times Inverse Document Frequency (tfidf)
'''
# TfidfTransformer is imported in the notebook's import cell.

# Term-frequency-only variant (no IDF weighting).
# NOTE(review): tf_transformer / X_train_tf are not used by the cells visible
# in this notebook; kept in case other cells rely on them.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Full tf-idf weighting; this is the representation the classifier is trained on.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(36955, 2951)

In [9]:
'''Train a Naive Bayes classifier on the existing data
'''
# MultinomialNB is imported in the notebook's import cell.
# Fit on the tf-idf features, using the category indices in training.target as labels.
classifier = MultinomialNB().fit(X_train_tfidf, training.target)

In [10]:
'''Example of utilizing the NB classifier to predict the geographic origin of untrained data points.
'''
docs_new = [
    'What a gooselock!',
    'Scoot your tush over.',
    "That's a hosey. Don't barf up your frappe.",
]

# Reuse the already-fitted vectorizer and tf-idf transformer (transform only,
# no refitting) so the new documents share the training feature space.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = classifier.predict(X_new_tfidf)

# Each prediction is an index into the category names.
for sentence, label_idx in zip(docs_new, predicted):
    print('{!r} => {}'.format(sentence, training.target_names[label_idx]))

'What a gooselock!' => alabama
'Scoot your tush over.' => virginia
"That's a hosey. Don't barf up your frappe." => massachusetts
