# Ciência de dados é fácil! Certo?

In [0]:
#@title Carregar base de dados (New York City Taxi Fare Prediction | Kaggle)
import pandas as pd
import numpy as np
import math
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')


# Download the sample CSV into the current working directory.
# The leading "!" is an IPython/Colab shell escape; "-q" keeps wget quiet.
!wget -q https://storage.googleapis.com/taxi-fare/afl.csv
# Load the downloaded file into the DataFrame used by every later cell.
df = pd.read_csv('afl.csv') # afl.csv: Small sample from NYC Taxi 100K records (https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
# Last expression of the cell: shows the first row so the columns can be inspected.
df.head(1)

In [0]:
#@title Visualizar estatísticas da base 
# Summary statistics of the DataFrame. The empty percentile list asks
# describe() not to add the default 25%/75% quartile rows.
# NOTE(review): whether the median (50%) row is still shown depends on the
# pandas version - confirm against the installed release.
df.describe(percentiles=[])

In [0]:
#@title Lat/Long igual a zero... Pode isso?
# Keep only the rows in which none of the four coordinate columns is exactly
# zero (presumably a placeholder for a missing GPS reading - confirm).
df = df[(df[['pickup_longitude', 'pickup_latitude',
             'dropoff_longitude', 'dropoff_latitude']] != 0).all(axis=1)]
# df.describe(percentiles=[])

In [0]:
#@title Extrair características da base
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the haversine (great-circle) distance between two points, in km.

    All four arguments are coordinates in degrees. They are passed through
    NumPy functions, so scalars and array-likes (e.g. pandas Series) both
    work, and a scalar may be mixed with an array-like.
    """
    # Earth radius used for the conversion from angle to distance (km).
    earth_radius_km = 6371

    # The trigonometric functions below expect radians.
    lat_from = np.radians(pickup_lat)
    lon_from = np.radians(pickup_lon)
    lat_to = np.radians(dropoff_lat)
    lon_to = np.radians(dropoff_lon)

    # Angular differences along each dimension.
    delta_lat = lat_to - lat_from
    delta_lon = lon_to - lon_from

    # Haversine term: squared half-chord length between the two points.
    lat_term = np.sin(delta_lat/2.0)**2
    lon_term = np.cos(lat_from) * np.cos(lat_to) * np.sin(delta_lon/2.0)**2
    haversine = lat_term + lon_term

    return 2 * earth_radius_km * np.arcsin(np.sqrt(haversine))

def add_airport_dist(dataset):
    """
    Add, for each airport, the distance from the trip to that airport.

    For every row the value stored is the smaller of the pickup-to-airport
    and dropoff-to-airport distances, as computed by sphere_dist(). The
    columns are written into `dataset` itself, which is also returned.

    Columns added:
        JFKDist: John F. Kennedy International Airport
        EWRDist: Newark Liberty International Airport
        LGADist: LaGuardia Airport
    """
    # (target column, airport latitude, airport longitude)
    airports = (
        ('JFKDist', 40.639722, -73.778889),
        ('EWRDist', 40.6925, -74.168611),
        ('LGADist', 40.77725, -73.872611),
    )

    start_lat, start_lon = dataset['pickup_latitude'], dataset['pickup_longitude']
    end_lat, end_lon = dataset['dropoff_latitude'], dataset['dropoff_longitude']

    for column, airport_lat, airport_lon in airports:
        from_pickup = sphere_dist(start_lat, start_lon, airport_lat, airport_lon)
        from_dropoff = sphere_dist(airport_lat, airport_lon, end_lat, end_lon)
        # Row-wise minimum of the two distances.
        dataset[column] = pd.concat([from_pickup, from_dropoff], axis=1).min(axis=1)

    return dataset
    
def add_datetime_info(dataset):
    """
    Parse the pickup timestamp and add the day of the week.

    The 'pickup_datetime' column is replaced by its parsed datetime values
    (strings are expected in the form "YYYY-MM-DD HH:MM:SS UTC"), and a new
    'DotW' column receives the weekday number (Monday=0 ... Sunday=6).
    `dataset` is modified in place and returned.
    """
    timestamps = pd.to_datetime(dataset['pickup_datetime'],
                                format="%Y-%m-%d %H:%M:%S UTC")
    dataset['pickup_datetime'] = timestamps
    dataset['DotW'] = dataset['pickup_datetime'].dt.dayofweek

    return dataset

  
# Remember which columns came with the raw data, so the report printed at the
# end of this cell does not rely on hard-coded column positions.
original_columns = list(df.columns)

# Date parts, sliced out of the raw "YYYY-MM-DD HH:MM:SS UTC" strings and
# stored as floats. This has to run before add_datetime_info(), which
# replaces those strings with parsed datetime values.
df['Day'] = df['pickup_datetime'].str.slice(8, 10).astype('float')
df['Month'] = df['pickup_datetime'].str.slice(5, 7).astype('float')
df['Year'] = df['pickup_datetime'].str.slice(0, 4).astype('float')
df['Hour'] = df['pickup_datetime'].str.slice(11, 13).astype('float')

# Store the passenger count as float as well.
df['passenger_count'] = df['passenger_count'].astype('float')

# Absolute coordinate differences between pickup and dropoff (in degrees).
df['DistLat'] = np.absolute(df['pickup_latitude'] - df['dropoff_latitude'])
df['DistLon'] = np.absolute(df['pickup_longitude'] - df['dropoff_longitude'])

# Straight-line (Euclidean) distance in degrees, built from the two
# differences above.
df['DistTot'] = np.sqrt(df['DistLon']**2 + df['DistLat']**2)

# Rows to discard: trips spanning 2 degrees or more in either dimension ...
tooFar = (df['DistLat'] >= 2) | (df['DistLon'] >= 2)

# ... and rows with no passengers, a negative fare, or a pickup location
# outside the longitude [-84, -64] / latitude [30, 50] box.
badData = ((df['passenger_count'] == 0) | (df['fare_amount'] < 0) |
           (df['pickup_longitude'] < -84) | (df['pickup_longitude'] > -64) |
           (df['pickup_latitude'] < 30) | (df['pickup_latitude'] > 50))

# Filter with a boolean mask (rather than dropping by index label) and take
# an explicit copy so the helpers below write into an independent frame.
df = df[~(tooFar | badData)].copy()

# Add the day of the week and the airport distances.
df = add_datetime_info(df)
df = add_airport_dist(df)

# Report raw versus engineered columns. The engineered ones are derived from
# the column list captured at the top, so no column is left out of the report.
new_columns = [column for column in df.columns if column not in original_columns]

print("Variáveis genuínas:")
print(original_columns)
print("Variáveis novas:")
print(new_columns)

In [0]:
#@title Treinar Modelo Preditivo 
# Target: the fare amount. Predictors: the raw coordinates and passenger
# count plus the engineered date, distance and airport features.
y = df['fare_amount']
x = df[['DistLon','DistLat','DistTot',
        'Day','Month','Year',
        'Hour','passenger_count','pickup_longitude',
        'pickup_latitude','dropoff_longitude','dropoff_latitude',
        'DotW','JFKDist','EWRDist','LGADist']]

# Hold out a very small test split (0.1% of the rows); it is only used to
# print individual predictions in the testing cell.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.001, random_state=1)

# Split the remainder into training (75%) and validation (25%) parts.
xTrain, xVal, yTrain, yVal = train_test_split(xTrain, yTrain, test_size=0.25, random_state=10)

# Create the model.
# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias, which newer XGBoost releases warn
# about or reject.
model = XGBRegressor(learning_rate=.03, colsample_bytree=.8,
                     max_depth=8, subsample=1,
                     objective='reg:squarederror', eval_metric='rmse',
                     early_stopping_rounds=10)

# Early stopping needs a validation set to monitor: without eval_set the
# early_stopping_rounds setting is either ignored or makes fit() fail,
# depending on the XGBoost version. verbose=False avoids printing the metric
# for every boosting round.
model.fit(xTrain, yTrain, eval_set=[(xVal, yVal)], verbose=False)

# Predict on the validation split.
predictions = model.predict(xVal)

# Score with Mean Absolute Error. The validation split also drives early
# stopping, so this figure may be slightly optimistic; the untouched test
# split (xTest / yTest) remains available for an independent check.
MAE = mean_absolute_error(yVal, predictions)
print("Erro Absoluto Médio: {}".format(MAE))

In [0]:
#@title Testar Modelo
# Run the trained model on the held-out test split.
test_predict = model.predict(xTest)

# Print each prediction (one decimal place) next to its reference fare.
print("TestID: Predição (Referência)\n--")
for index, (fare, reference) in enumerate(zip(test_predict, yTest.to_numpy())):
    print("Test#{}: {:.1f} ({})".format(index, fare, reference))