In [1]:
import json
import os
from itertools import zip_longest

import numpy as np
import pandas as pd
import tweepy

# Show up to 280 characters per cell so tweet text is not truncated on display.
# The fully qualified option name is used: abbreviated keys are resolved by
# pattern matching and stop working if the pattern ever matches several options.
pd.set_option('display.max_colwidth', 280)

## Load Twitter data

In [2]:
# read the scraped tweet ids back from the JSON file on disk
with open('all_ids.json') as json_file:
    tweet_ids = json.loads(json_file.read())

In [3]:
# authorize and initialize API
# The four credentials are read from environment variables so that they never
# have to be typed into (or committed with) the notebook. Each one falls back
# to an empty string when its variable is not set.
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True asks tweepy to wait out rate limits rather than fail
api = tweepy.API(auth, wait_on_rate_limit=True)

In [4]:
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into consecutive tuples of exactly ``n`` items.

    The last tuple is padded with ``fillvalue`` when the input runs out early:
    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    shared = iter(iterable)
    # the same iterator is handed to zip_longest n times, so each output tuple
    # consumes the next n items of the input
    return zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)

In [7]:
# Look up the tweets in chunks of 100 ids and record the creation date and
# full text of each one. Both lists are kept position-aligned with tweet_ids:
# an id for which no status comes back gets None in both lists.
tweet_dates = []
tweet_texts = []
for chunk in grouper(tweet_ids, 100):
    # grouper pads the final chunk with None; drop the padding before the API call
    batch_ids = [tweet_id for tweet_id in chunk if tweet_id is not None]
    statuses = api.statuses_lookup(batch_ids, tweet_mode='extended')
    # The lookup endpoint may return statuses in a different order than the
    # requested ids and leaves out unavailable tweets, so match results by id.
    # Keys are compared as strings so int and str ids both work.
    status_by_id = {str(status.id): status for status in statuses}
    for tweet_id in batch_ids:
        status = status_by_id.get(str(tweet_id))
        tweet_dates.append(status.created_at if status is not None else None)
        tweet_texts.append(status.full_text if status is not None else None)

In [14]:
# assemble ids, dates and texts into one table, one column per list
tweets_dict = dict(id=tweet_ids, date=tweet_dates, text=tweet_texts)
tweets = pd.DataFrame(data=tweets_dict)

In [15]:
# summarize the tweets frame: row count, per-column non-null counts and dtypes
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15831 entries, 0 to 15830
Data columns (total 3 columns):
id      15831 non-null object
date    15831 non-null datetime64[ns]
text    15831 non-null object
dtypes: datetime64[ns](1), object(2)
memory usage: 371.2+ KB


## Traffic violations

In [2]:
#df = pd.read_json('https://data.cityofnewyork.us/resource/nc67-uf89.json?$$app_token=zfhS7w7BeYBgPjVp9Dy8VNadO&$limit=999999999&violation=BIKE+LANE')

In [5]:
# URL for NYC Open Data resource nc67-uf89, restricted to rows whose
# issue_date contains "2019" (the %25 sequences are URL-encoded '%' wildcards).
# The app token is read from the SOCRATA_APP_TOKEN environment variable rather
# than hardcoded; when the variable is unset the token parameter is left out.
app_token = os.environ.get('SOCRATA_APP_TOKEN')
query = ("https://data.cityofnewyork.us/resource/nc67-uf89.json?"
         + ("$$app_token=" + app_token + "&" if app_token else "")
         + "$where=issue_date%20like%20'%252019%25'"
         "&$limit=999999999")

In [10]:
# Load the local CSV export of the violations data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the warning
# they trigger on load.
df = pd.read_csv('Open_Parking_and_Camera_Violations.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
# list the dtype pandas assigned to each column of the CSV
df.dtypes

Plate                   object
State                   object
License Type            object
Summons Number           int64
Issue Date              object
Violation Time          object
Violation               object
Judgment Entry Date     object
Fine Amount            float64
Penalty Amount         float64
Interest Amount        float64
Reduction Amount       float64
Payment Amount         float64
Amount Due             float64
Precinct               float64
County                  object
Issuing Agency          object
Violation Status        object
Summons Image           object
dtype: object

In [None]:
# import data
# Fetch the JSON served at `query` into a DataFrame. convert_dates=False is
# passed so date-like columns are not parsed here; issue_date and
# violation_time are converted explicitly in a later cell.
violations = pd.read_json(query, convert_dates=False)

In [59]:
# summarize the freshly loaded frame: row count, non-null counts and dtypes
violations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
plate                  5000 non-null object
state                  5000 non-null object
license_type           5000 non-null object
summons_number         5000 non-null int64
issue_date             5000 non-null object
violation_time         5000 non-null object
violation              5000 non-null object
fine_amount            5000 non-null int64
penalty_amount         5000 non-null int64
interest_amount        5000 non-null float64
reduction_amount       5000 non-null float64
payment_amount         5000 non-null float64
amount_due             5000 non-null float64
precinct               5000 non-null int64
county                 4949 non-null object
issuing_agency         5000 non-null object
summons_image          5000 non-null object
violation_status       496 non-null object
judgment_entry_date    500 non-null object
dtypes: float64(4), int64(4), object(11)
memory usage: 742.

In [60]:
# Merge the separate issue_date and violation_time columns into a single
# issue_date_time timestamp column.

# violation_time lacks the trailing 'M' of AM/PM, so it is appended before
# parsing. Only the time of day of the parsed value is meaningful.
parsed_time = pd.to_datetime(violations['violation_time'] + 'M')
time_of_day = parsed_time - parsed_time.dt.normalize()
# convert date to datetime
issue_day = pd.to_datetime(violations['issue_date'], format='%m/%d/%Y')
# make new column with date and time (vectorised; avoids the deprecated
# pd.datetime alias and a slow row-by-row apply)
violations['issue_date_time'] = issue_day.dt.normalize() + time_of_day
# drop old columns
violations = violations.drop(columns=['issue_date', 'violation_time'])

In [63]:
# re-check the frame after the date/time columns were merged into issue_date_time
violations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 18 columns):
plate                  5000 non-null object
state                  5000 non-null object
license_type           5000 non-null object
summons_number         5000 non-null int64
violation              5000 non-null object
fine_amount            5000 non-null int64
penalty_amount         5000 non-null int64
interest_amount        5000 non-null float64
reduction_amount       5000 non-null float64
payment_amount         5000 non-null float64
amount_due             5000 non-null float64
precinct               5000 non-null int64
county                 4949 non-null object
issuing_agency         5000 non-null object
summons_image          5000 non-null object
violation_status       496 non-null object
judgment_entry_date    500 non-null object
issue_date_time        5000 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(4), object(9)
memory usage: 703.2+ KB
