In [274]:
import glob
from pyspark.sql import SQLContext
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler
pd.options.display.max_columns = 999
try:
    sc and spark
except (NameError, UnboundLocalError) as e:

    import findspark

    findspark.init()
    import pyspark
    import pyspark.sql

    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(
        "E-Commerce Analytics").getOrCreate()
    sqlContext = SQLContext(spark)


def spark_read_parquet(path):
   return sqlContext.read.parquet(path)


base = "/Users/davidkatzaudio/Desktop/bi_ecom_project/serialized_data/"

cat_names = spark_read_parquet(base+"cat_names.parquet")
classified_orders = spark_read_parquet(base+"classified_orders.parquet")
orders_general = spark_read_parquet(base+"orders_general.parquet")
payments = spark_read_parquet(base+"payments.parquet")
products = spark_read_parquet(base+"products.parquet")
public_customers = spark_read_parquet(base+"public_customers.parquet")
sellers = spark_read_parquet(base+"sellers.parquet")
geo_table = spark_read_parquet(base+"geo_table.parquet")
orders_distance = spark_read_parquet(base+"orders_distance.parquet")

dataframes = {

    "cat_names": cat_names,
    "classified_orders": classified_orders,
    "orders_general": orders_general,
    "payments": payments,
    "products": products,
    "public_customers": public_customers,
    "sellers": sellers,
    "geo_table": geo_table,
    "orders_distance": orders_distance,
}

for item in dataframes.items():
    print("--- Creating SQL View for %s ---" % item[0])
    item[1].createOrReplaceTempView(item[0])


--- Creating SQL View for cat_names ---
--- Creating SQL View for classified_orders ---
--- Creating SQL View for orders_general ---
--- Creating SQL View for payments ---
--- Creating SQL View for products ---
--- Creating SQL View for public_customers ---
--- Creating SQL View for sellers ---
--- Creating SQL View for geo_table ---
--- Creating SQL View for orders_distance ---


In [275]:
df = spark.sql("SELECT * FROM classified_orders").toPandas().set_index("index_")

In [276]:
df["customer_city"] = df["customer_city"].str.strip().str.lower()
df["approval_delay"] = (df["order_aproved_at"] - df.order_purchase_timestamp).apply(lambda x: x.seconds)
df["order_month"] = df.order_purchase_timestamp.apply(lambda x:x.month)

In [None]:
features = [

    "order_freight_value",
    "order_items_qty",
    "product_photos_qty",
    "order_products_value",
    "product_category_name",
    "approval_delay",
    
] 

categoricals = ["customer_state", "product_category_name", "order_month"]

features = [i for i in features if i not in categoricals]
target = ["most_voted_class"]

In [278]:
df = df.dropna()
df.shape

In [279]:
encoder = OneHotEncoder()
normalizer = StandardScaler()
X_conts = pd.DataFrame(normalizer.fit_transform(X = df[features].reset_index(drop=True)), columns=features)

cats_df = pd.DataFrame(encoder.fit_transform(df[categoricals]).todense(), columns=encoder.get_feature_names())
X = pd.concat([X_conts, cats_df], axis=1)
feature_names = X.columns
y = (df[target]
     .replace("satisfeito_com_pedido", 0)
     .replace("problemas_de_entrega", 1)
     .replace("problemas_de_qualidade", 1).
     values)

(3303, 35)

In [322]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
model = RandomForestClassifier(n_estimators=30,max_depth=105, class_weight="balanced_subsample",criterion="entropy", )
model.fit(X_train, y_train)
print("Model Score: %3f" % model.score(X_test,y_test))
print("Mathew Corr Coeff: %3f" % matthews_corrcoef(y_test, model.predict(X_test)))

Model Score: 0.667339
Mathew Corr Coeff: 0.280632


  This is separate from the ipykernel package so we can avoid doing imports until


###### 