# Leitura da Tabela

In [None]:
import bigframes.pandas as bf
df = bf.read_gbq("davidoneilPDM.compras_gold", use_cache=False)

In [2]:
df.shape

(638881, 5)

# Pré-processamento e preparação dos dados de treinamento

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
labelEncoder = LabelEncoder()
df['client_id_encoded'] = labelEncoder.fit_transform(df['client_id'])

In [5]:
features = ['client_id_encoded', 'company_id', 'mes', 'ano']
target = 'numero_pedidos'

In [6]:
X = df[features]
y = df[target]

# Treinamento com Scikit-Learn

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X.to_pandas(), y.to_pandas(), test_size=0.2, random_state=42)

In [8]:
model = LinearRegression()
model.fit(X_train, y_train)
print("Modelo scikit-learn treinado com sucesso!")

Modelo scikit-learn treinado com sucesso!


In [9]:
import numpy as np

y_pred_val = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
print(f"rmse: {rmse:.2f} itens")

rmse: 9.64 itens


## Prediction com Scikit-Learn

In [10]:
import pandas as pd
df_predict_pd = pd.read_csv("gs://kaggle-pdm-2025/test.csv")
df_predict_pd['mes'] = 8
df_predict_pd['ano'] = 2025

In [11]:
df_predict_pd['client_id_encoded'] = labelEncoder.transform(df_predict_pd['client_id'])

In [12]:
X_future = df_predict_pd[features]
predictions = model.predict(X_future)

In [13]:
predictions

array([1.79675448, 1.90101105, 1.79143698, 1.79387166, 1.80159709,
       1.77781212, 1.79292855, 1.7854506 , 1.80872054, 1.89628883,
       1.88984763, 1.76240808, 1.79424623, 1.91417438, 1.78949057,
       1.79412583, 1.81910807, 1.8134762 , 1.72495143, 1.77275547,
       1.71865738, 1.80326258, 1.7652909 , 1.79860056, 1.89768677,
       1.80677414, 1.71883128, 1.81541592, 1.82012476, 1.88618223,
       1.78411287, 1.89955291, 1.75889652, 1.77984548, 1.77805291,
       1.74463624, 1.7632241 , 1.77905622, 1.79430642, 1.82308115,
       1.81274713, 1.75913062, 1.7373857 , 1.79417934, 1.79829957,
       1.81110172, 1.87766084, 1.71506555, 1.89749949, 1.7902397 ,
       1.8766174 , 1.78707595, 1.76743128, 1.89911815, 1.75804036,
       1.89759982, 1.84370238, 1.76219404, 1.7744544 , 1.90762616,
       1.90467645, 1.81289428, 1.80694135, 1.76439462, 1.77068867,
       1.7662942 , 1.76536448, 1.84633103, 1.77338421, 1.81523533,
       1.82106117, 1.90497744, 1.79102228, 1.74036216, 1.77756

In [14]:
df_predict_pd['total_pedidos_mes'] = np.maximum(0, np.round(predictions)).astype(int)

In [15]:
df_final_results = df_predict_pd[[
    'client_id', 'company_id', 'total_pedidos_mes'
]]

In [16]:
df_final_results.head()

Unnamed: 0,client_id,company_id,total_pedidos_mes
0,c39917,214,2
1,c15136,214,2
2,c40795,214,2
3,c40389,214,2
4,c39127,214,2


In [None]:
df_final_results.to_csv("gs://kaggle-pdm-2025/davidoneilPDM/sample_submission.csv", index=False)