In [40]:
import pandas as pd
import numpy as np


In [47]:


def open_and_filter(dataset):
    """Load an opportunities CSV and apply the baseline typing and cleaning.

    Parameters
    ----------
    dataset : str, path-like or file-like
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ID`` in which:

        * the date columns listed below are parsed as datetimes,
        * the categorical columns that are present are typed as ``category``,
        * the columns listed in ``constant_columns`` are not loaded,
        * the strings ``'Other'``, ``'NaT'`` and ``'None'`` are read as missing,
        * ``Sales_Contract_No`` is ``int64`` with missing values set to 0,
        * ``Month`` is parsed from the ``'%Y - %m'`` format,
        * rows with ``Region == "EMEA"`` whose ``Territory`` contains
          ``"America"`` are relabelled to the ``"Americas"`` region.

    Raises
    ------
    KeyError
        If ``Sales_Contract_No``, ``Month``, ``Region`` or ``Territory`` is
        missing from the loaded frame.
    """
    # Date columns are handed directly to ``parse_dates``.
    date_columns = [
        'Account_Created_Date', 'Opportunity_Created_Date',
        'Quote_Expiry_Date', 'Last_Modified_Date',
        'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date',
    ]

    # Categorical columns.
    # NOTE(review): 'Source ' carries a trailing space -- presumably it mirrors
    # the raw CSV header; confirm against the data files.
    # 'Prod_Category_A' is intentionally not listed: it is excluded from the
    # load below, so it needs no dtype.
    categorical_columns = [
        'Region', 'Territory', 'Bureaucratic_Code',
        'Source ', 'Billing_Country', 'Account_Name',
        'Opportunity_Name', 'Account_Owner', 'Opportunity_Owner',
        'Account_Type', 'Opportunity_Type', 'Quote_Type',
        'Delivery_Terms', 'Brand', 'Product_Type',
        'Size', 'Product_Category_B', 'Currency',
        'Last_Modified_By', 'Product_Family', 'Product_Name',
        'ASP_Currency', 'ASP_(converted)_Currency', 'Delivery_Quarter',
        'Total_Amount_Currency', 'Total_Taxable_Amount_Currency', 'Stage',
    ]
    column_types = {col: 'category' for col in categorical_columns}

    # Columns excluded because (per the original author) they hold the same
    # value in every record.
    constant_columns = ['Actual_Delivery_Date', 'Last_Activity',
                        'Submitted_for_Approval', 'Prod_Category_A']

    df = pd.read_csv(
        dataset,
        parse_dates=date_columns,
        dtype=column_types,
        index_col='ID',
        na_values=['Other', 'NaT', 'None'],
        usecols=lambda name: name not in constant_columns,
    )

    # Re-typing: missing contract numbers become 0 so the column can be int64.
    df['Sales_Contract_No'] = df['Sales_Contract_No'].fillna(0).astype(np.int64)
    df['Month'] = pd.to_datetime(df['Month'], format='%Y - %m')

    # Regrouping regions: American territories filed under EMEA belong to
    # "Americas". ``na=False`` keeps the mask strictly boolean when Territory
    # is missing.
    misfiled = (df['Region'] == "EMEA") & df['Territory'].str.contains("America", na=False)
    if misfiled.any():
        # Assigning a label that is not yet a category of a categorical column
        # raises, so register "Americas" first when the file does not have it.
        if "Americas" not in df['Region'].cat.categories:
            df['Region'] = df['Region'].cat.add_categories(["Americas"])
        df.loc[misfiled, "Region"] = "Americas"

    return df
    


Unnamed: 0_level_0,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Bureaucratic_Code,Account_Created_Date,Source,Billing_Country,...,Planned_Delivery_End_Date,Month,Delivery_Quarter,Delivery_Year,TRF,Total_Amount_Currency,Total_Amount,Total_Taxable_Amount_Currency,Total_Taxable_Amount,Stage
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27761,EMEA,,1,1,1,1,Bureaucratic_Code_4,2015-06-16,,Netherlands,...,2016-06-30,2016-05-01,Q2,2016,10,EUR,5272800.00,EUR,5272800.0,Closed Lost
27760,EMEA,,0,0,0,0,Bureaucratic_Code_4,2015-06-16,,Netherlands,...,2016-01-20,2016-01-01,Q1,2016,0,EUR,48230.00,EUR,48230.0,Closed Won
27446,Americas,NW America,0,0,0,0,Bureaucratic_Code_4,2015-04-21,Source_7,United States,...,2016-01-25,2016-01-01,Q1,2016,0,USD,83865.60,USD,83865.6,Closed Won
16808,Americas,NW America,1,0,1,0,Bureaucratic_Code_5,2013-07-27,Source_11,United States,...,2018-03-31,2018-02-01,Q1,2018,14,USD,7421881.50,USD,7421881.5,Closed Lost
16805,Americas,NW America,1,0,1,0,Bureaucratic_Code_5,2013-07-27,Source_11,United States,...,2018-02-28,2018-02-01,Q1,2018,25,USD,13357192.50,USD,13357192.5,Closed Lost
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8781,EMEA,Austria,1,1,1,1,Bureaucratic_Code_4,2016-01-15,Source_7,Austria,...,2016-03-25,2016-03-01,Q1,2016,0,EUR,103350.00,EUR,299715.0,Closed Won
8786,EMEA,Austria,1,1,1,1,Bureaucratic_Code_4,2016-01-15,Source_7,Austria,...,2016-04-08,2016-04-01,Q2,2016,0,EUR,93015.00,EUR,299715.0,Closed Won
8792,EMEA,Austria,1,1,1,1,Bureaucratic_Code_4,2016-01-15,Source_7,Austria,...,2016-03-31,2016-03-01,Q1,2016,0,EUR,103350.00,EUR,299715.0,Closed Won
28561,Americas,NE America,1,1,1,1,Bureaucratic_Code_4,2015-10-20,,United States,...,2016-04-29,2016-04-01,Q2,2016,4,USD,2346796.88,USD,0.0,Closed Lost


# Pre-procesamiento de los datos

El objetivo de esta etapa es recibir los datos "crudos" y realizar los procedimientos necesarios para filtrar features de poco valor y crear otros features que revelen información de importancia, para que los modelos de machine learning que los utilizarán en una etapa posterior puedan realizar una predicción más precisa.

In [48]:
# Run both raw CSV splits through the same loading/cleaning routine.
trainset = open_and_filter(dataset="data/Train_TP2_Datos_2020-2C.csv")
testset = open_and_filter(dataset="data/Test_TP2_Datos_2020-2C.csv")

In [76]:
# Report rows/columns of each split; each message names the split it describes.
print(f"The train set has {trainset.shape[0]} elements and {trainset.shape[1]} features")
print(f"The test set has {testset.shape[0]} elements and {testset.shape[1]} features")

The train set has 16947 elements and 47 features
The train set has 2551 elements and 46 features


In [51]:
# Folder prefix for the pre-processed splits (the directory must already exist).
path = "datasets/"

# NOTE(review): index=False leaves the index out of the file, and
# open_and_filter sets 'ID' as the index, so 'ID' is not written -- confirm
# this is intended.
trainset.to_csv(f"{path}train.csv", index=False)
testset.to_csv(f"{path}test.csv", index=False)