In [1]:
import pandas as pd
import numpy as np
import geopy.distance

In [3]:
# NOTE: `df` is rebound by `df = transform_df()` in the next cell, so this
# frame is replaced before anything else in the notebook reads it.
df = pd.read_csv("./ecobicidata/ecobici_mar_resume.csv")
name_of_files = ["jan","feb","mar","apr","may","jun","jul"]
# One CSV per month, stacked into a single frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every month restarts at 0 and the
# combined frame carries repeated row labels.
fulldf = pd.concat(
    [pd.read_csv(f"./ecobicidata/ecobici_{month}.csv") for month in name_of_files],
    ignore_index=True,
)

In [4]:
def _normalise_id_column(column):
    """Render an id column as text, dropping a trailing ".0" and keeping gaps missing."""
    as_text = column.astype(str).str.replace(r"\.0$", "", regex=True)
    # Keep missing ids missing (instead of the text "nan") so dropna() can remove them.
    return as_text.where(column.notna())


def transform_df(ene=None):
    """Derive timestamp, duration and id columns for a frame of raw trips.

    Parameters
    ----------
    ene : pandas.DataFrame, optional
        Raw trips with the columns Fecha_Retiro, Hora_Retiro, Fecha_Arribo,
        Hora_Arribo, Ciclo_Estacion_Retiro, Ciclo_EstacionArribo, Bici and
        Genero_Usuario. When omitted, the notebook-level ``fulldf`` is used
        (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns full_date_retiro, full_date_aribo,
        Mes, Hora, time_delta (minutes, 2 decimals) and viaje
        ("<origin id>-<destination id>"); the three id columns are text;
        missing Genero_Usuario is "X"; rows with any remaining missing value
        are dropped. The input frame is left untouched.
    """
    if ene is None:
        ene = fulldf
    # Work on a copy: the original edited the caller's frame in place, so
    # running the cell twice stripped two more characters from every id.
    ene = ene.copy()

    timestamp_format = "%d/%m/%Y %H:%M:%S"
    ene["full_date_retiro"] = pd.to_datetime(
        ene["Fecha_Retiro"] + " " + ene["Hora_Retiro"], format=timestamp_format
    )
    ene["full_date_aribo"] = pd.to_datetime(
        ene["Fecha_Arribo"] + " " + ene["Hora_Arribo"], format=timestamp_format
    )
    ene["Mes"] = ene["full_date_retiro"].dt.month
    ene["Hora"] = ene["full_date_retiro"].dt.hour
    elapsed = ene["full_date_aribo"] - ene["full_date_retiro"]
    ene["time_delta"] = (elapsed / pd.Timedelta(minutes=1)).round(2)

    # Only a literal ".0" suffix is removed, so integer-typed ids stay intact
    # (a blind [:-2] slice would turn "123" into "1").
    for id_column in ("Ciclo_Estacion_Retiro", "Bici", "Ciclo_EstacionArribo"):
        ene[id_column] = _normalise_id_column(ene[id_column])

    ene["viaje"] = ene["Ciclo_Estacion_Retiro"] + "-" + ene["Ciclo_EstacionArribo"]
    ene["Genero_Usuario"] = ene["Genero_Usuario"].fillna("X")
    return ene.dropna(axis=0).copy()

# Called without arguments, so the function's default frame is transformed.
# This rebinds `df`, replacing the frame read from ecobici_mar_resume.csv in the load cell.
df = transform_df()

In [5]:
def estaciones_df():
    """Load the station catalogue and return two lookup tables keyed by station id.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(estaciones_retiro, estaciones_arribo)``. Each table holds the station
        id as text (column ``Ciclo_Estacion_Retiro`` / ``Ciclo_EstacionArribo``)
        followed by the station name, latitude, longitude and ``punto_geo``,
        suffixed with ``_retiro`` / ``_arribo`` respectively.
    """
    required_columns = [
        "id", "name", "districtcode", "districtname",
        "location_lat", "location_lon", "stationtype", "punto_geo",
    ]
    stations = pd.read_csv("./ecobicidata/estaciones-de-ecobici.csv")[required_columns]
    station_key = stations["id"].astype("str")

    def _lookup_table(key_column, suffix):
        # Keep only the descriptive columns, tag them with the trip side, and
        # put the join key first.
        table = stations[["name", "location_lat", "location_lon", "punto_geo"]].add_suffix("_" + suffix)
        table.insert(0, key_column, station_key)
        return table

    estaciones_retiro = _lookup_table("Ciclo_Estacion_Retiro", "retiro")
    estaciones_arribo = _lookup_table("Ciclo_EstacionArribo", "arribo")
    return estaciones_retiro, estaciones_arribo

# Station lookup tables for the origin (retiro) and destination (arribo) side of a trip.
estaciones_retiro, estaciones_arribo = estaciones_df()

In [6]:
def mergingfiles(month, er, ea):
    """Attach origin and destination station details to every trip.

    Parameters
    ----------
    month : pandas.DataFrame
        Trips; needs the columns Ciclo_Estacion_Retiro and Ciclo_EstacionArribo.
    er : pandas.DataFrame
        Origin-station table, joined on Ciclo_Estacion_Retiro.
    ea : pandas.DataFrame
        Destination-station table, joined on Ciclo_EstacionArribo.

    Returns
    -------
    pandas.DataFrame
        The trips with both station tables left-joined, so trips without a
        matching station are kept with missing station columns.
    """
    with_origin = month.merge(er, how="left", on="Ciclo_Estacion_Retiro")
    with_origin_and_destination = with_origin.merge(ea, how="left", on="Ciclo_EstacionArribo")
    return with_origin_and_destination

# Trips enriched with station data; both joins are left joins, so a trip whose
# station id has no match is kept with missing station columns.
exportfileI = mergingfiles(month=df, er=estaciones_retiro, ea=estaciones_arribo)

In [18]:
def filetoexport(first):
    """Add the origin-to-destination distance in kilometres to each trip.

    Parameters
    ----------
    first : pandas.DataFrame
        Trips with the columns location_lat_retiro, location_lon_retiro,
        location_lat_arribo, location_lon_arribo and Genero_Usuario. Any index
        is accepted (for example the output of ``DataFrame.sample``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``first`` with the same rows and index plus a
        ``location_dist`` column (``geopy.distance.distance(...).km``), and
        with missing Genero_Usuario replaced by "X". The distance is NaN when
        any of the four coordinates is missing or not numeric.
    """
    coord_columns = [
        "location_lat_retiro", "location_lon_retiro",
        "location_lat_arribo", "location_lon_arribo",
    ]
    coords = first[coord_columns].apply(pd.to_numeric, errors="coerce")
    has_all_coords = coords.notna().all(axis=1)

    # Rows with a gap get NaN rather than a distance to a placeholder point:
    # the previous fill reused a latitude value as the longitude, which
    # produced distances of thousands of kilometres.
    distances_km = [
        geopy.distance.distance((lat_r, lon_r), (lat_a, lon_a)).km if complete else float("nan")
        for (lat_r, lon_r, lat_a, lon_a), complete in zip(
            coords.itertuples(index=False, name=None), has_all_coords
        )
    ]

    # Assign by position onto a copy. Concatenating a freshly indexed frame
    # matched rows by label, which dropped rows and paired distances with the
    # wrong trips whenever `first` did not have a 0..n-1 index.
    result = first.copy()
    result["location_dist"] = distances_km
    result["Genero_Usuario"] = result["Genero_Usuario"].fillna("X")
    return result

# Distances are computed on a random subset of at most 350000 trips.
# The sample size is capped at the number of available rows (sample() raises
# when asked for more rows than exist) and the draw is seeded so that
# re-running the notebook selects the same trips.
lI = filetoexport(
    first=exportfileI.sample(n=min(350000, len(exportfileI)), random_state=42)
)

In [9]:
# Analysis frame: empty strings count as missing, and any row that still has a
# missing value in any column is discarded.
efile = (
    exportfileI
    .replace("", np.nan)
    .dropna(axis=0)
)

In [72]:
# Id of the bike with the most trips, and that bike's trips.
# The id is stored as `top_bike_id` rather than `top_bike`: a later cell defines
# a function called `top_bike`, and sharing the name meant that running this
# cell after that one replaced the function with a string.
top_bike_id = efile[["Bici"]].groupby("Bici").size().reset_index().sort_values(by=0, ascending=False)["Bici"].to_list()[0]
bike = efile.loc[efile["Bici"] == top_bike_id]

In [90]:
# Top Bike
def top_bike(efile=None):
    """Summarise the activity of the bike with the most trips.

    Parameters
    ----------
    efile : pandas.DataFrame, optional
        Trips with the columns Bici, Mes, Fecha_Retiro and the four
        location_lat/lon_retiro/arribo columns; full_date_retiro is used for
        ordering when present. When omitted, the notebook-level ``efile`` is
        used (looked up at call time).

    Returns
    -------
    dict
        ``"trips_per_moth"``: ``{month: {"Trips": count}}`` for that bike (the
        key spelling is kept as-is because it is part of the emitted data);
        ``"total_km"``: summed trip distance, truncated to an int;
        ``"last_trips"``: coordinates of the bike's 100 most recent trips,
        oldest first.

    Raises
    ------
    IndexError
        If the frame contains no trips.
    """
    if efile is None:
        # Resolve the notebook-level frame when called, not when defined, so
        # the function never works on a stale copy.
        efile = globals()["efile"]

    trips_by_bike = efile.groupby("Bici").size()
    busiest_bike = trips_by_bike.sort_values(ascending=False).index[0]
    bike = efile.loc[efile["Bici"] == busiest_bike]

    trips_per_month = bike.groupby("Mes").size().to_frame("Trips").transpose().to_dict()

    coord_columns = [
        "location_lat_retiro", "location_lon_retiro",
        "location_lat_arribo", "location_lon_arribo",
    ]
    # Trips with a missing coordinate are left out of the total instead of
    # being measured against a placeholder point (the previous fill reused a
    # latitude value as the longitude).
    usable_coords = bike[coord_columns].apply(pd.to_numeric, errors="coerce").dropna()
    total_km = sum(
        geopy.distance.distance((lat_r, lon_r), (lat_a, lon_a)).km
        for lat_r, lon_r, lat_a, lon_a in usable_coords.itertuples(index=False, name=None)
    )

    # Order by real timestamps: Fecha_Retiro is "dd/mm/YYYY" text, so sorting
    # the raw strings orders by day of month rather than chronologically.
    if "full_date_retiro" in bike.columns:
        chronological = bike.sort_values(by="full_date_retiro")
    else:
        chronological = bike.sort_values(
            by="Fecha_Retiro",
            key=lambda dates: pd.to_datetime(dates, format="%d/%m/%Y"),
        )
    lastbiketrips = chronological.tail(100)[coord_columns].to_dict(orient="records")

    topBike = {
        "trips_per_moth": trips_per_month,
        # int() on a plain float; calling int() on a one-element Series is deprecated.
        "total_km": int(total_km),
        "last_trips": lastbiketrips,
    }

    return topBike

# Summary for the most-used bike, computed on the function's default frame.
topBike = top_bike()    

In [241]:
# The 100 most frequent (destination name, origin name, route) combinations,
# most frequent first. The trip count sits in the column labelled 0.
listofhundredtrips = (
    efile[["name_arribo", "name_retiro", "viaje"]]
    .groupby(["name_arribo", "name_retiro", "viaje"])
    .size()
    .reset_index()
    .set_index("viaje")
    .sort_values(by=0, ascending=False)
    .head(100)
    .reset_index()
)

# Row position -> {column: value}
last_dict = listofhundredtrips.transpose().to_dict()

# NOTE(review): the "viaje" entry carries the trip count (column 0), not the
# route id -- confirm this is what the consumer of this list expects.
finaltrips = []
for position in range(len(last_dict)):
    route = last_dict[position]
    finaltrips.append({
        "name_arribo": route["name_arribo"],
        "name_retiro": route["name_retiro"],
        "viaje": route[0],
    })

{'location_lat_retiro': 19.391323,
 'location_lon_retiro': -99.15936,
 'location_lat_arribo': 19.391323,
 'location_lon_arribo': -99.15936}

In [157]:
# Pre-computed sample read from disk; the first CSV column becomes the index.
# NOTE(review): the cells below read its "location_dist", "Mes" and "time_delta"
# columns -- presumably a saved output of filetoexport(); confirm how allyear.csv is produced.
all_yearsample = pd.read_csv("./ecobicidata/allyear.csv", index_col=0)

In [270]:
# Each aggregate is computed once and reused below; previously the same
# full-frame groupby was re-run for every key that needed it.

# Trips per user age: columns "Edad_Usuario" and "Trips".
age_trip_counts = (
    efile[["Edad_Usuario"]]
    .groupby("Edad_Usuario")
    .size()
    .reset_index()
    .rename(columns={0: "Trips"})
)

# 100 most frequent routes, indexed by "viaje"; the trip count is column 0.
top_routes = (
    efile[["name_arribo", "name_retiro", "viaje"]]
    .groupby(["name_arribo", "name_retiro", "viaje"])
    .size()
    .reset_index()
    .set_index("viaje")
    .sort_values(by=0, ascending=False)
    .head(100)
)

# Median trip duration per month, taken from the pre-computed sample.
monthly_median_time = all_yearsample[["Mes", "time_delta"]].groupby("Mes").median()

# Payload for the JSON export. Keys are kept exactly as the consumer expects
# them; note that the "avg_*" entries hold medians, not means.
full_year_dataI={
    "trips_per_month":efile[["Mes"]].groupby("Mes").size().reset_index().set_index("Mes").rename(columns={0:"Trips"}).transpose().to_dict(),
    "age_distribution":{
        "age":age_trip_counts["Edad_Usuario"].to_list(),
        "trips":age_trip_counts["Trips"].to_list()
    },
    "median_trip_time":efile["time_delta"].median(),
    "total_trips":len(efile),
    "median_trips_per_bike":efile[["Bici"]].groupby("Bici").size().reset_index()[0].median(),
    "top_100_trips":{
        "name_arribo":top_routes["name_arribo"].to_list(),
        "name_retiro":top_routes["name_retiro"].to_list(),
        "num_of_trips":top_routes[0].to_list(),
        "dict_of_trips":finaltrips
    },
    "avg_km_per_trip":round(all_yearsample["location_dist"].median(),2),
    "avg_time_per_trip_per_year":monthly_median_time.transpose().to_dict(),
    "avg_time_delta":monthly_median_time["time_delta"].median(),
    "topBike":topBike
}

In [51]:
import json
# with open("./app/sample5.json", "w") as outfile:
#     json.dump(full_year_dataI, outfile)

In [1]:
# Ad-hoc check of trips per month. Needs `efile` from the earlier cell: the
# NameError recorded below comes from running this cell before `efile` was defined.
efile[["Mes"]].groupby("Mes").size().reset_index().set_index("Mes").rename(columns={0:"Trips"}).transpose().to_dict()

NameError: name 'efile' is not defined

In [53]:
# Coordinates of trips that start at station '1'.
top_station_trips = efile.loc[
    efile['Ciclo_Estacion_Retiro']=='1',
    ['location_lat_retiro','location_lon_retiro','location_lat_arribo','location_lon_arribo'],
]

# Up to 100 of those trips. The size is capped at the number of available rows
# (sample() raises when asked for more rows than exist) and the draw is seeded
# so the exported file is the same on every run.
tripsofstation = (
    top_station_trips
    .sample(n=min(100, len(top_station_trips)), random_state=42)
    .reset_index()
    .transpose()
    .to_dict()
)

topStationDict = [{
    "location_lat_retiro": tripsofstation[i]['location_lat_retiro'],
    "location_lon_retiro": tripsofstation[i]['location_lon_retiro'],
    "location_lat_arribo": tripsofstation[i]['location_lat_arribo'],
    "location_lon_arribo": tripsofstation[i]['location_lon_arribo'],
}  for i in range(len(tripsofstation))]

with open("./app/topstation1.json", "w") as outfile:
    json.dump(topStationDict, outfile)