In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from google.colab import files
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the BABA.csv dataset directly from its raw GitHub URL.
BABA_CSV_URL = "https://raw.githubusercontent.com/diogenesjusto/FIAP/master/SHIFT/desafio/baba/BABA.csv"
df = pd.read_csv(BABA_CSV_URL)

In [None]:
# Parse the YYYYMMDD values in `date` into real datetimes.
# The dtype guard makes this cell safe to re-run: once the column is already
# datetime64, casting it to str yields '2014-08-01'-style text that no longer
# matches '%Y%m%d' and the conversion would raise.
if not pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = pd.to_datetime(df['date'].astype(str), format='%Y%m%d')

In [None]:
# Sanity check: print the first rows of the parsed `date` column.
print(df.loc[:, 'date'].head())

0   2014-08-01
1   2014-08-02
2   2014-08-03
3   2014-08-04
4   2014-08-05
Name: date, dtype: datetime64[ns]


In [None]:
# Display the loaded frame (pandas truncates the middle rows of long frames in
# the rendered output).
df

Unnamed: 0.1,Unnamed: 0,X,date,mes,weekday,margem,venda,desconto,outdesc,outmg
0,1,1,2014-08-01,agosto,sexta-feira,0.406111,110042.460,9190.907534,0,0
1,2,2,2014-08-02,agosto,sabado,0.416022,58377.320,5713.043012,0,0
2,3,3,2014-08-03,agosto,domingo,0.431993,64635.390,8621.708915,0,0
3,4,4,2014-08-04,agosto,segunda-feira,0.409216,140417.321,18312.965640,0,0
4,5,5,2014-08-05,agosto,terca-feira,0.449648,149700.286,19942.741300,0,0
...,...,...,...,...,...,...,...,...,...,...
391,392,392,2015-08-27,agosto,quinta-feira,0.451562,,5846.133551,0,0
392,393,393,2015-08-28,agosto,sexta-feira,0.472608,,2657.847261,0,0
393,394,394,2015-08-29,agosto,sabado,0.385612,,269.530000,0,0
394,395,395,2015-08-30,agosto,domingo,0.495615,,3434.838118,0,0


In [None]:
import plotly.express as px

# Line chart of `venda` against `date`.
fig = px.line(
    data_frame=df,
    x='date',
    y='venda',
    title='Vendas ao longo do tempo',
)
fig.show()

In [None]:
# Total sales per month.
# groupby() returns the month names sorted alphabetically (abril, agosto,
# dezembro, ...), which scrambles the bar chart. Each month therefore also
# carries its calendar number, taken from the parsed `date` column (the
# date-parsing cell must have run), and the rows are sorted by it; Plotly draws
# categorical bars in the order the rows are supplied.
df_mes = (
    df.assign(_ordem=df['date'].dt.month)
      .groupby('mes', as_index=False)
      .agg(venda=('venda', 'sum'), _ordem=('_ordem', 'min'))
      .sort_values('_ordem')
      .drop(columns='_ordem')
      .reset_index(drop=True)
)
fig = px.bar(df_mes, x='mes', y='venda', title='Venda acumulada por mês')
fig.show()

In [None]:
# Average sales per day of the week.
# groupby() sorts the weekday labels alphabetically (domingo, quinta-feira,
# sabado, ...), so the bars would not follow the week. Each label also carries
# the weekday number derived from the parsed `date` column (Monday=0 ... Sunday=6;
# the date-parsing cell must have run) and the rows are sorted by it; Plotly
# draws categorical bars in the order the rows are supplied.
df_weekday = (
    df.assign(_ordem=df['date'].dt.weekday)
      .groupby('weekday', as_index=False)
      .agg(venda=('venda', 'mean'), _ordem=('_ordem', 'min'))
      .sort_values('_ordem')
      .drop(columns='_ordem')
      .reset_index(drop=True)
)
fig = px.bar(df_weekday, x='weekday', y='venda', title='Venda média por dia da semana')
fig.show()

In [None]:
# Does a higher margin go along with more sales? Scatter of the two columns.
fig = px.scatter(
    data_frame=df,
    x='margem',
    y='venda',
    title='Margem vs Venda',
)
fig.show()

In [None]:
# Sales over time drawn twice on one figure: individual points plus a
# connecting line, to make unusual days easier to spot by eye.
linha_vendas = px.line(df, x='date', y='venda')
fig = px.scatter(
    df,
    x='date',
    y='venda',
    title='Vendas ao longo do tempo (detecção visual de outliers)',
)
fig.add_traces(linha_vendas.data)
fig.show()

In [None]:
# Distribution of daily sales, binned into at most 50 buckets.
fig = px.histogram(
    data_frame=df,
    x='venda',
    nbins=50,
    title='Distribuição das Vendas',
)
fig.show()

In [None]:
# Margin against sales again, this time titled for outlier inspection.
fig = px.scatter(
    data_frame=df,
    x='margem',
    y='venda',
    title='Margem vs Venda com possíveis outliers',
)
fig.show()

In [None]:
# Flag as outliers the days whose sales exceed the 95th percentile
# (the top 5% is used only as an example cut-off).
limite = df['venda'].quantile(0.95)
df_outliers = df.loc[df['venda'].gt(limite)]

# Red markers for the flagged days, overlaid on the full scatter.
pontos_outliers = px.scatter(
    df_outliers,
    x='date',
    y='venda',
    color_discrete_sequence=['red'],
)
fig = px.scatter(df, x='date', y='venda', title='Outliers de Vendas')
fig.add_traces(pontos_outliers.data)
fig.show()

In [None]:
import unicodedata


def _normalize_label(label):
    """Reduce a month/weekday label to a comparable key.

    Strips accents, surrounding whitespace and case, and drops anything after
    the first hyphen, so 'Terça-feira', 'terca-feira' and 'terça' all become
    'terca'. Non-string values (e.g. NaN) are returned unchanged so that the
    subsequent ``.map`` lookup yields NaN for them.
    """
    if not isinstance(label, str):
        return label
    without_accents = (
        unicodedata.normalize('NFKD', label)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    return without_accents.strip().lower().split('-')[0]


# Month name -> calendar month number
mes_map = {
    'janeiro': 1, 'fevereiro': 2, 'março': 3, 'abril': 4,
    'maio': 5, 'junho': 6, 'julho': 7, 'agosto': 8,
    'setembro': 9, 'outubro': 10, 'novembro': 11, 'dezembro': 12
}

# Weekday name (Portuguese) -> weekday number, Monday=0 ... Sunday=6
weekday_map = {
    'segunda': 0, 'terça': 1, 'quarta': 2, 'quinta': 3,
    'sexta': 4, 'sábado': 5, 'domingo': 6
}

# The dataset spells its labels without accents and with the '-feira' suffix
# ('terca-feira', 'sabado', ...), so a direct lookup in the dictionaries above
# misses almost every weekday and produces NaN. Both the dictionary keys and
# the column values are normalized before the lookup.
_mes_lookup = {_normalize_label(nome): num for nome, num in mes_map.items()}
_weekday_lookup = {_normalize_label(nome): num for nome, num in weekday_map.items()}

df['mes_num'] = df['mes'].map(_normalize_label).map(_mes_lookup)
df['weekday_num'] = df['weekday'].map(_normalize_label).map(_weekday_lookup)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Keep only the rows where the target (`venda`) is present.
df_clean = df.dropna(subset=['venda'])

# Features: month and weekday are one-hot encoded (first level dropped),
# `margem` is passed through as-is. Target: `venda`.
feature_cols = ['mes', 'weekday', 'margem']
X = pd.get_dummies(df_clean[feature_cols], drop_first=True)
y = df_clean['venda']

# Random 80/20 split with a fixed seed so the result is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares and predict on the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report hold-out error metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE:", mse)
print("R2 Score:", r2)

MSE: 3593315425.8225145
R2 Score: 0.15547123438972577


In [None]:
# Pair each held-out actual value with its prediction.
result_df = pd.DataFrame({'Real': y_test}).assign(Previsto=y_pred)

# Bounds of the actual values, used for the y = x reference line.
real_min = result_df['Real'].min()
real_max = result_df['Real'].max()

fig = px.scatter(
    result_df,
    x='Real',
    y='Previsto',
    title='Venda Real vs Venda Prevista',
)
# Red diagonal: points on it are predicted exactly.
fig.add_shape(
    type='line',
    x0=real_min, y0=real_min,
    x1=real_max, y1=real_max,
    line=dict(color='red'),
)
fig.show()

In [None]:
import plotly.graph_objects as go

# Test-set features plus actual value, prediction and the matching date
# (looked up in df_clean by the test rows' index), ordered chronologically.
result_df = X_test.assign(
    Real=y_test,
    Previsto=y_pred,
    date=df_clean.loc[y_test.index, 'date'],
).sort_values('date')

# One line+marker trace per series, sharing the date axis.
fig = go.Figure()
for serie in ('Real', 'Previsto'):
    fig.add_trace(
        go.Scatter(
            x=result_df['date'],
            y=result_df[serie],
            mode='lines+markers',
            name=serie,
        )
    )

fig.update_layout(title='Vendas Reais e Previstas no tempo')
fig.show()

In [None]:
# Residual = actual minus predicted, plotted against the prediction.
result_df['residuo'] = result_df['Real'].sub(result_df['Previsto'])

previsto_min = result_df['Previsto'].min()
previsto_max = result_df['Previsto'].max()

fig = px.scatter(
    result_df,
    x='Previsto',
    y='residuo',
    title='Resíduos do Modelo',
)
# Red horizontal line at zero residual across the prediction range.
fig.add_shape(
    type='line',
    x0=previsto_min, y0=0,
    x1=previsto_max, y1=0,
    line=dict(color='red'),
)
fig.show()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a depth-limited regression tree on the same training split.
# NOTE: this rebinds `model` and `y_pred`, replacing the linear-regression
# objects created earlier in the notebook.
model = DecisionTreeRegressor(max_depth=5, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [34]:
# Feature importances of the fitted tree, labelled by training column and
# ordered from most to least important.
importancias = pd.Series(model.feature_importances_, index=X_train.columns)
importancias = importancias.sort_values(ascending=False)

fig = px.bar(importancias, title="Importância das Features na Árvore de Decisão")
fig.show()

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted tree to Graphviz DOT text.
dot_data = export_graphviz(
    model,
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
    special_characters=True
)

# Build the graph and write it to disk once (produces tree_decision.pdf).
graph = graphviz.Source(dot_data)
graph.render("tree_decision")

# Show the tree inline as the cell output. The previous graph.view() call
# rendered the file a second time and tried to launch a desktop PDF viewer,
# which has no effect on a hosted/headless kernel such as Colab.
graph

'tree_decision.pdf'

In [35]:
from sklearn.tree import export_text

# Plain-text dump of the tree's split rules, using the training column names.
nomes_features = X_train.columns.tolist()
rules = export_text(model, feature_names=nomes_features)
print(rules)

|--- weekday_sabado <= 0.50
|   |--- mes_agosto <= 0.50
|   |   |--- weekday_segunda-feira <= 0.50
|   |   |   |--- mes_maio <= 0.50
|   |   |   |   |--- mes_junho <= 0.50
|   |   |   |   |   |--- value: [83131.98]
|   |   |   |   |--- mes_junho >  0.50
|   |   |   |   |   |--- value: [54738.56]
|   |   |   |--- mes_maio >  0.50
|   |   |   |   |--- weekday_terca-feira <= 0.50
|   |   |   |   |   |--- value: [44561.04]
|   |   |   |   |--- weekday_terca-feira >  0.50
|   |   |   |   |   |--- value: [72540.64]
|   |   |--- weekday_segunda-feira >  0.50
|   |   |   |--- mes_julho <= 0.50
|   |   |   |   |--- mes_junho <= 0.50
|   |   |   |   |   |--- value: [116110.35]
|   |   |   |   |--- mes_junho >  0.50
|   |   |   |   |   |--- value: [65011.91]
|   |   |   |--- mes_julho >  0.50
|   |   |   |   |--- margem <= 0.42
|   |   |   |   |   |--- value: [46634.63]
|   |   |   |   |--- margem >  0.42
|   |   |   |   |   |--- value: [61044.88]
|   |--- mes_agosto >  0.50
|   |   |--- weekday_