# Correlating Language to Geographic Location

Having parsed a significant number of geographically linked phrases, we would now like a way to predict the geographic origin of a speaker or writer from their language. Since no dataset can possibly cover the full extent of U.S. dialects, we will use machine learning techniques to predict geographic origin even when the raw data is not entirely conclusive.

In order to accomplish this, we will first vectorize words and sentences, then utilize a Naive Bayes classifier (scikit-learn's implementation).

Please refer to and run `geodare.json`, which converts the raw DARE corpus into a more usable format, prior to using this notebook.

In [34]:
import sklearn
import numpy as np
import pandas as pd
import nltk

In [5]:
# Load the cleaned DARE corpus CSV into a DataFrame; the cells below read
# its 'dialect' and 'word' columns.
geodata = pd.read_csv(filepath_or_buffer="../data/cleaned_dare_corpus.csv")

In [9]:
# Collect the distinct dialect labels in order of first appearance.
# dict.fromkeys keeps insertion order and de-duplicates in a single pass,
# avoiding the linear `x not in list` scan for every row.
# NOTE: the name keeps its original spelling because later cells refer to it.
catagories = list(dict.fromkeys(geodata['dialect']))

['alabama', 'alaska', 'algonquian', 'arizona', 'arkansas', 'bahamian', 'california', 'caribbean', 'cherokee', 'choctaw', 'colorado', 'connecticut', 'delaware', 'florida', 'georgia', 'gullah', 'hawaii', 'idaho', 'illinois', 'illinois, chicago', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'louisiana, new orleans', 'maine', 'maryland', 'maryland, baltimore', 'massachusetts', 'massachusetts, boston', 'massachusetts, cape cod', 'massachusetts, nantucket', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'narraganset', 'nebraska', 'nevada', 'new hampshire', 'new jersey', 'new mexico', 'new york', 'new york city', 'new york, hudson valley', 'new york, long island', 'new york, upstate', 'newfoundland', 'north carolina', 'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode island', 'south carolina', 'south carolina, charleston', 'south dakota', 'tennessee', 'texas', 'utah', 'vermont', 'virginia', 'washington', 'washington, dc', 'west virginia', 'wisconsin

In [45]:
# Encode every row's dialect as an integer class label: the position of that
# dialect within `catagories`.
#
# DataFrame.get_value was deprecated in pandas 0.21 and removed in 1.0, so the
# 'dialect' column is iterated directly instead of being looked up row by row.
# A label -> index dict replaces the per-row linear `catagories.index(...)`.
category_index = {}
for position, label in enumerate(catagories):
    # setdefault keeps the first position of a label, matching list.index().
    category_index.setdefault(label, position)

targets = [category_index[dialect] for dialect in geodata['dialect']]

targets


[0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,