In [1]:
### Data handling imports
import pandas as pd
import numpy as np
from pandarallel import pandarallel

pandarallel.initialize()
import reverse_geocoder as rg
import unidecode

### Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
import copy

sns.set()
%matplotlib inline
%load_ext lab_black
import os
from collections import Counter
from collections import OrderedDict
from operator import itemgetter

import json

# Statistics imports
import scipy, scipy.stats

pd.set_option("display.max_columns", None)

import warnings

warnings.filterwarnings("ignore")


class color:
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
    END = "\033[0m"

INFO: Pandarallel will run on 28 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
inputFolder = "../twitter-data-geo-output"
outputFolder = "./twitter-data-demographics-output"

In [3]:
def read_json(filepath):
    with open(filepath, "r") as file:
        return json.load(file)

In [4]:
df = pd.read_csv("./tweets-with-geocoding.csv")
df["CountyId"] = df["CountyId"].parallel_apply(lambda x: str(x))

In [5]:
df_clean = df.query('CountyId != "nan"')

In [6]:
df_clean.shape

(256449, 39)

In [7]:
df_clean.head()

Unnamed: 0,created_at,id,id_str,text,truncated,geo,place,quote_count,reply_count,retweet_count,favorite_count,entities,lang,timestamp_ms,user_id_str,user_name,user_screen_name,user_location,user_description,user_verified,user_followers_count,user_friends_count,user_listed_count,user_favourites_count,user_statuses_count,user_created_at,user_utc_offset,user_geo_enabled,user_lang,user_default_profile,display_text_range,extended_entities,possibly_sensitive,extended_tweet,quoted_status_id,quoted_status_id_str,quoted_status,quoted_status_permalink,CountyId
0,2020-03-19 10:59:57+00:00,1240593849694605312,1240593849694605312,current weather in Burlington: overcast clouds...,False,"[44.48, -73.21]","{'id': '8f07721fe6eb5c41', 'url': 'https://api...",0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",en,2020-03-19 10:59:57.543,112034664,Burlington Weather,BurlingtonVT,"Burlington, VT","Weather updates, forecast, warnings and inform...",False,120.0,1.0,14,0.0,37409,2010-02-07 01:44:27+00:00,,True,,False,,,,,,,,,50007.0
1,2020-03-19 10:59:59+00:00,1240593857349218309,1240593857349218304,"Flash Flood Warning continues for Nixa MO, Oza...",False,"[36.82104014, -93.47691589]","{'id': '2526edd24c06e60c', 'url': 'https://api...",0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",en,2020-03-19 10:59:59.368,600227042,NWS Springfield,NWSSpringfield,"Springfield, Missouri",Official Twitter account for the National Weat...,True,33227.0,306.0,512,2396.0,17173,2012-06-05 16:24:58+00:00,,True,,False,"[0, 83]","{'media': [{'id': 1240593855927300098, 'id_str...",0.0,,,,,,29209.0
2,2020-03-19 11:00:00+00:00,1240593862382489600,1240593862382489600,ee69b5dca7d2458d2039dc8f3e12c093a733e7b83d0618...,True,"[27.26026626, -166.01094411]",,0,0,0,0,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",fr,2020-03-19 11:00:00.568,3048544857,GooGuns Lulz,googuns_lulz,(here),@victor_zheng,False,259.0,1.0,257,0.0,3973912,2015-02-21 03:26:23+00:00,,True,,False,,,,{'full_text': 'ee69b5dca7d2458d2039dc8f3e12c09...,,,,,15007.0
3,2020-03-19 11:00:00+00:00,1240593862659149824,1240593862659149824,"Wind 0.0 mph NNW. Barometer 29.83 in, Rising s...",False,"[37.92583333, -120.63]","{'id': 'fbd6d2f5a4e4a15e', 'url': 'https://api...",0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",en,2020-03-19 11:00:00.634,865368865,Donald Price,DonaldPrice11,,,False,5.0,24.0,0,14.0,8142,2012-10-06 21:52:04+00:00,,True,,True,,,,,,,,,6009.0
4,2020-03-19 11:00:02+00:00,1240593868438917126,1240593868438917120,"MED - MEDICAL at 6300 SE DIVISION ST, PORT [Po...",False,"[45.505914, -122.598196]","{'id': 'ac88a4f17a51c7fc', 'url': 'https://api...",0,0,0,0,"{'hashtags': [{'text': 'RP20000023885', 'indic...",en,2020-03-19 11:00:02.012,1606472113,Mult Co Fire/EMS log,pdxfirelog,"Multnomah County, Oregon","Unofficial automated posting of Portland, Oreg...",False,2844.0,10.0,118,0.0,538176,2013-07-19 17:57:31+00:00,,True,,False,,,,,,,,,41051.0


In [8]:
df_clean.to_csv("twittes-with-geocoding-cleaned.csv", index=False)

In [38]:
df_index = df_clean[["id", "CountyId", "user_name"]]

In [39]:
df_index.shape

(256449, 3)

In [40]:
df_index.to_csv("twitter-index.csv", index=False)

In [45]:
df_index_new = df_index["user_name"].str.split(" ", n=1, expand=True)

In [50]:
import nltk
from nltk.corpus import names

# For shuffling
import random

In [56]:
def gender_features(word):
    """ feature extractor for the name classifier
    The feature evaluated here is the last letter of a name
    feature name - "last_letter"
    """
    return {"last_letter": word[-1]}  # feature set


# Extract the data sets
labeled_names = [(name, "male") for name in names.words("male.txt")] + [
    (name, "female") for name in names.words("female.txt")
]

print(len(labeled_names))  # 7944 names

# Shuffle the names in the list
random.shuffle(labeled_names)

# Process the names through feature extractor
feature_sets = [(gender_features(n), gender) for (n, gender) in labeled_names]

# Divide the feature sets into training and test sets
train_set, test_set = feature_sets[500:], feature_sets[:500]

# Train the naiveBayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Test out the classifier with few samples outside of training set
print(classifier.classify(gender_features("Burlington Weather")))  # returns male
print(classifier.classify(gender_features("Donald Price	")))  # returns female

# Test the accuracy of the classifier on the test data
print(nltk.classify.accuracy(classifier, test_set))  # returns 0.78 for now

# examine classifier to determine which feature is most effective for
# distinguishing the name's gender
print(classifier.show_most_informative_features(5))

7944
male
female
0.764
Most Informative Features
             last_letter = 'a'            female : male   =     34.7 : 1.0
             last_letter = 'k'              male : female =     31.5 : 1.0
             last_letter = 'p'              male : female =     18.5 : 1.0
             last_letter = 'f'              male : female =     15.8 : 1.0
             last_letter = 'm'              male : female =     10.5 : 1.0
None
