---
# Import dependecies

In [1]:
import pandas as pd

---
# Work with data

## Preprocess data

In [2]:
df = pd.read_excel("data/train.xlsx", skiprows=5)

In [3]:
df.shape

(314, 148)

In [9]:
df.head(3)

Unnamed: 0,год,неделя,Начало нед,"Продажи, рубли","Продажи, упаковки","Продажи, рубли.1","раствор 0.01 % 150 мл N1, руб","раствор 0.01 % 50 мл N1, руб","раствор 0.01 % 500 мл N1, руб","раствор 0.01 % 150 мл N1, упаковки",...,Итого.11,"ТВ, trp\n(Ж 30-60 ВС).12","ТВ, охват 5+\n(Ж 30-60 ВС).12","ТВ, рубли.12","Диджитал, руб.12","ТВ спонсорство, рубли.12","OOH, рубли.12","Радио, рубли.13",Итого.12,Unnamed: 147
0,2019.0,1,2018-12-31,41535570.0,114621.15,0.0,0.0,0.0,0.0,0.0,...,11834250.0,207.903106,15.3615,9085528.010323,0.0,0.0,0.0,0.0,9085528.0,
1,,2,2019-01-07,51222290.0,141523.2,0.0,0.0,0.0,0.0,0.0,...,13996530.0,317.467542,25.5255,10599782.67871,0.0,0.0,0.0,0.0,10599780.0,
2,,3,2019-01-14,57127350.0,157926.3,0.0,0.0,0.0,0.0,0.0,...,11466290.0,192.816682,13.9125,10599782.67871,0.0,0.0,0.0,0.0,10599780.0,


## Split on train/test data sets

In [28]:
df_train = df.iloc[:244]
df_submission = df.iloc[244:]

## Save data

In [29]:
df_train.to_csv('data/df_train.csv', index=False)
df_submission.to_csv('data/df_submission.csv', index=False)

---
# Model train 

In [30]:
import pandas as pd

# Load the dataset into a DataFrame
data = df_train

# Get the non-numerical columns
non_numerical_columns = data.select_dtypes(exclude=['number']).columns.tolist()

# Output the non-numerical column names
print("Non-numerical columns:")
non_numeric_features = []

for col in non_numerical_columns:
    non_numeric_features.append(col)

non_numeric_features

Non-numerical columns:


['Начало нед',
 '(1)\nТВ, trp\n(Ж 30-60 ВС)',
 '(1)\nТВ, рубли',
 '(3)\nТВ, рубли',
 '(тотал)\nТВ, рубли',
 'ТВ, рубли.1',
 'ТВ Рег, рубли',
 'ТВ, trp\n(Ж 30-60 ВС).3',
 'ТВ, trp\n(Ж 30-60 ВС).5',
 'ТВ, рубли.5',
 'ТВ, рубли.6',
 'ТВ, рубли.8',
 'ТВ, рубли.11',
 'ТВ, рубли.12']

In [33]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error


# Load the dataset
data = df_train
# Split features and target variable
X = data.drop(non_numeric_features, axis=1).drop('Продажи, рубли', axis=1)
y = data['Продажи, рубли']

# X = X.fillna(0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set and evaluate the model using MAPE
y_pred = rf_model.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'Mean Absolute Percentage Error: {mape}')


Mean Absolute Percentage Error: 0.03890101026918674


---
# Create submission

## Load data

In [35]:
df_submission = pd.read_csv('data/df_submission.csv')
df_submission.head(3)

Unnamed: 0,год,неделя,Начало нед,"Продажи, рубли","Продажи, упаковки","Продажи, рубли.1","раствор 0.01 % 150 мл N1, руб","раствор 0.01 % 50 мл N1, руб","раствор 0.01 % 500 мл N1, руб","раствор 0.01 % 150 мл N1, упаковки",...,Итого.11,"ТВ, trp\n(Ж 30-60 ВС).12","ТВ, охват 5+\n(Ж 30-60 ВС).12","ТВ, рубли.12","Диджитал, руб.12","ТВ спонсорство, рубли.12","OOH, рубли.12","Радио, рубли.13",Итого.12,Unnamed: 147
0,,36,2023-09-04,,,,,,,,...,,,,,,,,,,
1,,37,2023-09-11,,,,,,,,...,,,,,,,,,,
2,,38,2023-09-18,,,,,,,,...,,,,,,,,,,


# Predict data

In [39]:
df_submission['Продажи, рубли'] = rf_model.predict(df_submission.drop(non_numeric_features, axis=1).drop('Продажи, рубли', axis=1))

In [42]:
df_submission = df_submission[['Начало нед', 'Продажи, рубли']]
df_submission.rename(columns={'Начало нед': 'week', 'Продажи, рубли': 'revenue'}, inplace=True)
df_submission.head(5)

Unnamed: 0,week,revenue
0,2023-09-04,59868050.0
1,2023-09-11,59868050.0
2,2023-09-18,59868050.0
3,2023-09-25,59868050.0
4,2023-10-02,59868050.0


# Save data

In [None]:
df_train.to_csv('data/submission.csv', index=False)