# Data extraction

## Load file

In [29]:
import pandas as pd

# Raw export to ingest, located inside the project's CSV folder.
file = "output2.csv"
path = f"./data/csv/{file}"

# Column labels are supplied by hand: read_csv is called with header=None,
# so the first line of the file is treated as data, not as a header row.
columns = [
    "type", "date", "time", "duration", "distance", "origin", "destination",
    "total_earning", "base_fare", "customer_fare", "paid_to_driver",
    "paid_to_uber",
]

# Load the file into a DataFrame.
df = pd.read_csv(path, header=None, names=columns)

In [35]:
# Data-quality overview of the raw frame: number of rows, missing values per
# column, and rows that exactly repeat an earlier row.
print('Dataframe length: ', len(df))
print()
print('Null values: ', df.isna().sum(), sep='\n')
print()
print("Duplicated values: ", df.duplicated().sum())

Dataframe length:  471

Null values: 
type              28
date              28
time              28
duration          28
distance          28
origin            28
destination       28
total_earning     29
base_fare         29
customer_fare     48
paid_to_driver    48
paid_to_uber      50
dtype: int64

Duplicated values:  27


In [36]:
# Keep only rows without any missing value, then drop exact duplicates.
# Both steps return new frames, so `df` is left untouched and no in-place
# mutation is needed. The index is renumbered once, after the last filter,
# so `df1` always ends up with a contiguous 0..n-1 index.
df1 = df.dropna().drop_duplicates(ignore_index=True)

In [38]:
# Same data-quality overview, now for the cleaned frame: both the null counts
# and the duplicate count are expected to be zero here.
print('Dataframe length: ', len(df1))
print()
print('Null values: ', df1.isna().sum(), sep='\n')
print()
print("Duplicated values: ", df1.duplicated().sum())

Dataframe length:  421

Null values: 
type              0
date              0
time              0
duration          0
distance          0
origin            0
destination       0
total_earning     0
base_fare         0
customer_fare     0
paid_to_driver    0
paid_to_uber      0
dtype: int64

Duplicated values:  0


# Transform data

In [39]:
# Work on an independent copy so the cleaned frame `df1` stays unchanged
# while columns are added and converted.
df2 = df1.copy(deep=True)

## Change data to expected formats

### Format dates and duration

In [44]:
from dateutil import parser

def convert_date(date_str):
    """Return ``date_str`` re-formatted as a ``YYYY-MM-DD`` string.

    The text is parsed with ``parser.parse`` using its default settings.
    NOTE(review): ambiguous numeric dates (e.g. 02/03/2025) follow the
    parser's default day/month ordering - confirm it matches the export.
    """
    parsed = parser.parse(date_str)
    return parsed.strftime('%Y-%m-%d')

def convert_time(time_str):
    """Return ``time_str`` re-formatted as a 24-hour ``HH:MM:SS`` string."""
    return parser.parse(time_str).strftime('%H:%M:%S')

# Normalise both text columns, then join them with a single space and parse
# the result into one timestamp column.
df2['date'] = df2['date'].map(convert_date)
df2['time'] = df2['time'].map(convert_time)
df2['datetime'] = pd.to_datetime(df2['date'].str.cat(df2['time'], sep=' '))

In [45]:
import pytimeparse
from datetime import timedelta

def parse_duration(duration_str):
    """Convert a textual duration into a ``datetime.timedelta``.

    Args:
        duration_str: Duration text to be interpreted by ``pytimeparse.parse``.

    Returns:
        A ``timedelta`` spanning the parsed number of seconds.

    Raises:
        ValueError: If ``pytimeparse`` cannot interpret ``duration_str``.
    """
    seconds = pytimeparse.parse(duration_str)
    if seconds is None:
        # pytimeparse signals failure by returning None; without this check
        # timedelta() would fail with a TypeError that hides the bad input.
        raise ValueError(f'Unrecognised duration: {duration_str!r}')
    return timedelta(seconds=seconds)

# Element-wise conversion of the textual duration into timedelta objects.
df2['duration_dt'] = df2['duration'].map(parse_duration)

### Format distance

In [47]:
def get_distance(dist_str):
    """Extract the numeric part of a distance string such as ``'3.5 km'``.

    Args:
        dist_str: Text holding a number followed by a unit, separated by
            whitespace.

    Returns:
        The leading number as a ``float``; ``0.0`` when the text does not
        contain both a value token and a unit token.

    Raises:
        ValueError: If the first token is not a valid float literal.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so ' 3.5 km' does not yield an empty first token.
    tokens = dist_str.split()
    if len(tokens) < 2:
        return 0.0
    return float(tokens[0])

# Element-wise extraction of the numeric distance from the text column.
df2['distance_km'] = df2['distance'].map(get_distance)

### Get geolocation of origin and destination

In [48]:
import googlemaps

# Path of the text file that stores the Google Maps API key.
key = './data/places_key.txt'
with open(key, 'r') as f:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise become part of the key handed to the client.
    key_str = f.read().strip()
gmaps = googlemaps.Client(key_str)

def get_lat_lng(location_str, gmaps=gmaps):
    """Geocode an address and return its coordinates.

    Args:
        location_str: Address or place description sent to the geocoder.
        gmaps: Client object exposing ``geocode``; defaults to the
            module-level ``gmaps`` client.

    Returns:
        ``[lat, lng]`` taken from the first geocoding result, or ``None``
        when there is no result or it lacks the expected fields.
    """
    location = gmaps.geocode(location_str)
    try:
        coordinates = location[0]['geometry']['location']
        # Read the keys explicitly instead of relying on dict value order.
        return [coordinates['lat'], coordinates['lng']]
    except (IndexError, KeyError, TypeError):
        # Empty result list or unexpected payload shape: treat as "not found".
        # Other exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        return None

In [49]:

# One geocoding request per row of the origin column.
df2['origin_lat_lng'] = df2['origin'].map(get_lat_lng)

In [51]:
# One geocoding request per row of the destination column.
df2['destination_lat_lng'] = df2['destination'].map(get_lat_lng)

### Format monetary values

In [54]:
# Monetary columns are text carrying an 'R$' currency marker; remove the
# marker and convert the remainder to float.
columns_to_transform = ['total_earning', 'base_fare', 'customer_fare', 'paid_to_driver', 'paid_to_uber']
# 'R$' is matched literally (regex=False), so '$' needs no escaping; writing
# the pattern as 'R\$' in a normal string is an invalid escape sequence.
df2[columns_to_transform] = df2[columns_to_transform].apply(
    lambda col: col.str.replace('R$', '', regex=False).astype(float)
)

In [67]:
# Load the 2025-02-02 backup and write it out again under today's date.
# NOTE(review): this re-saves the old backup, not the current `df2` - confirm
# that this is the intended behaviour.
date = f'{pd.Timestamp.now():%Y-%m-%d}'
backup = pd.read_csv('./artifacts/saved_df_backup_2025-02-02.csv', index_col=0)
backup.to_csv(f'./artifacts/saved_df_backup_{date}.csv')


In [72]:

# Stack the newly transformed rows under the backed-up history and persist
# the result; ignore_index gives the combined frame a fresh 0..n-1 index.
concat = pd.concat((backup, df2), axis=0, ignore_index=True)
concat.to_csv('./artifacts/saved_df.csv')