In [25]:
%%time
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from shapely.ops import transform

from shapely.geometry import Point, Polygon
import datetime

from io import StringIO
from pathlib import Path, PureWindowsPath
import os
import sys
from functools import partial

# Import custom functions from `scripts` folder
# The parent of the current working directory is appended to sys.path (only
# if it is not already there) so that the `clean_tweets` import that follows
# can be resolved.
# NOTE(review): presumably clean_tweets.py lives in that parent directory -- confirm.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from clean_tweets import geometrize_tweets, convert_shapefile_crs, find_frequencies

IndentationError: expected an indented block (clean_tweets.py, line 47)

In [16]:
def geometrize_tweets(df, lon='location.lon', lat='location.lat'):
    """
    Convert a DataFrame of tweets into a GeoDataFrame based on lat/lon coords.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the two columns named by `lon` and `lat`.

    lon, lat : string (optional, default='location.lon', 'location.lat')
        Names of the columns holding each tweet's longitude and latitude.

    Returns
    -------
    gpd.geodataframe.GeoDataFrame
        Built from `df`, with a point geometry per tweet and the CRS set to
        WGS84 (EPSG:4326).

    """
    # One point per tweet, built in a single vectorised call.
    # x is the longitude and y is the latitude.
    geometry = gpd.points_from_xy(df[lon], df[lat])

    # Use the authority string rather than the deprecated {'init': 'epsg:4326'} dict
    return gpd.GeoDataFrame(df, crs='EPSG:4326', geometry=geometry)

In [18]:
def convert_shapefile_crs(shapefile):
    """
    Convert shapefile CRS to WGS84 (epsg:4326).
    Function may take a while to run on large shapefiles.

    The reprojection is delegated to geopandas' own `to_crs`, which transforms
    the geometries and also updates the `crs` attribute so the metadata stays
    in sync with the coordinates.

    Parameters
    ----------
    shapefile : geopandas.GeoDataFrame
        Must already have a CRS set; geopandas cannot reproject geometries
        whose source CRS is unknown.

    Returns
    -------
    shapefile : geopandas.GeoDataFrame
        The same object that was passed in, with updated 'geometry' column
        and CRS (the conversion is applied in place).
    """
    # inplace=True keeps the original contract: the caller's object is
    # modified and then returned.
    shapefile.to_crs(epsg=4326, inplace=True)

    return shapefile

In [19]:
def find_frequencies(series, pat, case=False, ratio=False):
    """
    Find the number (or ratio) of times that a pattern occurs in a list of tweets.

    Parameters
    ----------
    series : pd.Series
        Column of text containing tweets. Must hold strings, i.e. support the
        `.str` accessor. Missing values count towards the number of tweets
        but never count as a match.

    pat : string
        Regular expression to check against `series`.

    case : boolean (optional, default=False)
        If True, comparisons are case-sensitive (e.g. 'pattern' != 'PaTtErN')
        If False, comparisons are case-insensitive. (e.g. 'pattern' == 'PaTtErN')

    ratio : boolean (optional, default=False)
        If True, return the ratio (number_of_matches) / (number_of_tweets).
        If False, return a tuple (number_of_matches, number_of_tweets).

    Returns
    -------
    float if `ratio` is True, otherwise a tuple of two integers.
    The ratio of an empty `series` is NaN.

    """
    n = len(series)

    if n == 0:
        # Nothing to search; also avoids dividing by zero below
        num_matches = 0
    else:
        # na=False turns missing tweets into non-matches, so the mask is purely boolean
        num_matches = series.str.contains(pat, case=case, na=False).sum()

    if ratio:
        return num_matches / n if n else float('nan')

    return num_matches, n

In [21]:
def assign_home_location(data, uid='u_id', SA2='SA2_5DIG16', date='date', hour='hour',
                         min_tweets=10, min_days=10, min_hours=8):
    """
    Assign a home location for Twitter users and their tweets based on following methodology:

    1. Consider tracts satisfying the following properties:
        - More than `min_tweets` tweets total
        - Sent from more than `min_days` different days
        - Sent from more than `min_hours` different hours of the day
    2. Of the remaining candidates, select the tract with the most tweets

    This function does not guarantee that all Twitter users/tweets will be assigned a home location.
    Some users will not have any tweets that meet the criteria defined above; this will result in a
    missing value (np.NaN) being assigned to the home tract for that user's tweets.


    Parameters
    ----------
    data : pd.DataFrame or gpd.geodataframe.GeoDataFrame
        DataFrame containing the following columns (variables passed into the function):
            - uid : Twitter user ID
            - SA2 : Tract identifier (e.g. tract ID, FIPS code)
            - date : Datetime object containing just the date
                     (year, month, and day; not a full timestamp)
            - hour : Integer containing 24-hour-format hour of tweet

    uid, SA2, date, hour : string (optional, default='u_id', 'SA2_5DIG16', 'date', 'hour')
        Column names to extract from `data`; additional details under `data` parameter

    min_tweets : integer (optional, default=10)
        A tract is valid only if the user sent MORE than this many tweets from it

    min_days : integer (optional, default=10)
        A tract is valid only if the user tweeted from it on MORE than this many unique days

    min_hours : integer (optional, default=8)
        A tract is valid only if the user tweeted from it in MORE than this many unique hours


    Returns
    -------
    pd.Series of length data.shape[0], containing a home location for each tweet.
    Note that this function is not an inplace operation.
    e.g. df['home_tract'] = assign_home_location(df)

    """
    # One pass over the (user, tract) groups collects every statistic the
    # criteria need, instead of re-grouping and filtering once per criterion.
    grouped = data.groupby([uid, SA2])
    stats = pd.DataFrame({
        'count': grouped.size(),
        'n_days': grouped[date].nunique(),
        'n_hours': grouped[hour].nunique(),
    })

    # Thresholds are exclusive: a tract must exceed each minimum
    is_candidate = (
        (stats['count'] > min_tweets)
        & (stats['n_days'] > min_days)
        & (stats['n_hours'] > min_hours)
    )

    home_locations = (
        stats[is_candidate]
        .reset_index()
        # A stable sort keeps ties in (uid, SA2) order, so the result is deterministic
        .sort_values(by='count', ascending=False, kind='mergesort')
        # The first row per user is that user's most-tweeted valid tract
        .drop_duplicates(subset=uid, keep='first')
        .set_index(uid)[SA2]
    )

    # Users with no valid tract are absent from the mapping and become NaN
    return data[uid].map(home_locations.to_dict())


In [22]:
def summary_stats(data, uid='u_id'):
    """
    Analyze the following:
        - Number of tweets (printed output)
        - Number of unique users (printed output)
        - Median number of tweets/user (printed and returned output)
        - Number of tweets/user at the 99th percentile (printed and returned output)

    Parameters
    ----------
    data : pd.DataFrame or gpd.GeoDataFrame
        DataFrame containing tweets, one row per tweet; must contain the
        column named by `uid`

    uid : string (optional, default='u_id')
        Name of the column holding the Twitter user ID

    Returns
    -------
    pct_50_tweets : float
        Median number of tweets/user

    pct_99_tweets : float
        99th percentile of tweets/user
    """
    # Number of tweets
    print("{} total tweets".format(len(data)))

    # Number of unique users
    print("{} unique users\n".format(data[uid].nunique()))

    # Percentiles of tweets/user (median + 99th); the two-element result of
    # quantile() is unpacked in the order the quantiles were requested
    tweets_per_user = data.groupby(uid).size()
    pct_50_tweets, pct_99_tweets = tweets_per_user.quantile([.50, .99])
    print("Median number of tweets/user: {} tweets".format(pct_50_tweets))
    print("99th percentile of tweets/user: {} tweets".format(pct_99_tweets))

    return pct_50_tweets, pct_99_tweets
