<a href="https://colab.research.google.com/github/dutt2022/Coursera-Data-science-Projects/blob/main/xgbboost_9_targests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download to disk.
CHUNK_SIZE = 40960
# DATA_SOURCE_MAPPING (next line): comma-separated "<directory>:<url-encoded URL>"
# entries, parsed by the download loop further down in this cell.
# NOTE(review): the URL is signed (X-Goog-Date=20240307T112846Z,
# X-Goog-Expires=259200), so it has presumably expired -- confirm and
# regenerate it before relying on this cell.
DATA_SOURCE_MAPPING = 'financial-performance-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F70781%2F7733789%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240307%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240307T112846Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1276e621ede41c3556cfafa0324160b0fdff1c3f6bfd91e89b9629823a7eff58303299e4dca367e489c4d62930755f952b3be91bd5fa926715d2bdfd2a002df4636861397dd62abbb3c437e500d3b8de7ad6725a674c3168f12f80fd1e6a0d66cf0291bf9864dd6f70035c2a7efe4777d6ea06f54f79f44b166085518e79fb0957d7944433e864b9babd87344d6813320947fb6dfdaf43a197261bf95f48b52906145bab5345739b837b0f0ce679fa1c1c54c5d3b7796db098f75bff16c955e59feb5c59f687659c78ba8ada5a591cc96793e485d4ca699d4c73ddeb5d7f5dd89745c1fa9cf1c591ef59bbb7356e4fc3e930eaeaf585c218cb32118163d2d9a8'

# Root directory that each downloaded archive is extracted under.
KAGGLE_INPUT_PATH='/kaggle/input'
# Second directory created by this cell and linked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere else in the visible notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into a temporary file, then unpack it
# under KAGGLE_INPUT_PATH/<directory>. A failed source is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded URL>"; split only on the first
    # colon so a stray colon in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length may be absent; in that case the bar stays empty
            # and only the running byte counter is meaningful.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes out and rewind so the archive readers see
            # the complete download from its first byte.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the open handle (no reopen by name) and bind the
                # archive to a name that does not shadow the tarfile module.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must stay ahead of OSError: HTTPError is one of its subclasses.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import cross_val_score as cvs
from xgboost import XGBRegressor

import os
# Print the full path of every file found under the input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Load the train and test tables from the competition's input directory.
train = pd.read_csv('/kaggle/input/financial-performance-prediction/train.csv')
test = pd.read_csv('/kaggle/input/financial-performance-prediction/test.csv')

## Look at data shape, dtypes and unite train & test into a single DF

In [None]:
# Report (rows, columns) for each split.
for split_name, split_frame in (('Train', train), ('Test', test)):
    print(f'{split_name} shape: {split_frame.shape}')

In [None]:
# Target columns are the ones present in train but absent from test.
targets_names = [name for name in train.columns if name not in test.columns]
print(f'Target columns: {targets_names}')

In [None]:
train.dtypes.value_counts()

In [None]:
test.dtypes.value_counts()

In [None]:
# make an easy col to split back the train and test sets
# Tag every row with its origin so the combined frame can be split back into
# the train and test sets later.
for frame, came_from_train in ((train, True), (test, False)):
    frame['is_train'] = came_from_train
df = pd.concat([train, test], axis=0, ignore_index=True)

## Deal with string/categoric columns

In [None]:
# Preview only the first rows of the string (object-dtype) columns instead of
# rendering the whole frame.
df.select_dtypes(include='O').head()

In [None]:
# Number of distinct non-missing values in each string (object-dtype) column.
object_cardinality = df.select_dtypes(include='O').nunique()
for column_name, n_unique in object_cardinality.items():
    print(f'{column_name} unique values num: {n_unique}')
    print('-'*50)

It looks best to label-encode the `industry` column, which has 113 unique categories.
`sector` and `recommendationKey` can safely be one-hot-encoded as it will expand the dataset by only 16 columns.
And we will just binarize the `financialCurrency` column

In [None]:
def binarize(df, col):
    """Replace ``df[col]`` in place with a 0/1 indicator column.

    The first non-missing value found in the column is encoded as 1; every
    other value, including missing ones, is encoded as 0.

    Args:
        df: DataFrame to modify (mutated in place).
        col: Name of the column to binarize.

    Returns:
        The same ``df`` object, for convenient reassignment.
    """
    non_missing = df[col].dropna()
    if non_missing.empty:
        # No value can act as the positive class (column empty or entirely
        # missing), so every row gets 0 instead of failing on an index lookup.
        df[col] = 0
        return df
    positive_value = non_missing.iloc[0]
    # Comparing a missing value is False, so missing rows fall into the 0 class.
    df[col] = (df[col] == positive_value).astype(int)
    return df

In [None]:
def one_hot_encode(df, col):
    """Return a copy of ``df`` with ``col`` replaced by integer dummy columns.

    One ``<col>_<value>`` column holding 0/1 is appended for every distinct
    value of ``col``. The input frame is left untouched, so running the
    calling cell again on the original frame still works.

    Args:
        df: Source DataFrame.
        col: Name of the categorical column to expand.

    Returns:
        A new DataFrame without ``col`` and with the dummy columns appended.
    """
    dummies = pd.get_dummies(df[col], prefix=col).astype(int)
    # drop() without inplace returns a new frame instead of editing the caller's.
    return pd.concat([df.drop(columns=col), dummies], axis=1)

In [None]:
def factorize(df, col):
    """Replace ``df[col]`` in place with integer codes.

    Codes are assigned in order of first appearance: the first distinct value
    becomes 0, the next one 1, and so on.

    Args:
        df: DataFrame to modify (mutated in place).
        col: Name of the column to encode.

    Returns:
        The same ``df`` object.
    """
    categories = df[col].unique()
    code_of = dict(zip(categories, range(len(categories))))
    df[col] = df[col].map(code_of)
    return df

In [None]:
# Collapse the currency column to a 0/1 flag and preview the first rows.
df = binarize(df, 'financialCurrency')
print(df['financialCurrency'].head())

In [None]:
# Print the shape before and after encoding to show how many columns the
# dummy variables add.
print(df.shape)
df = one_hot_encode(df, 'sector')
df = one_hot_encode(df, 'recommendationKey')
print(df.shape)

In [None]:
# Integer-code the industry column, then list the dtypes that remain.
df = factorize(df, 'industry')
print(df.dtypes.value_counts())

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Deal with missing values

In [None]:
df.isnull().sum()

In [None]:
print('Missing data stats:\n')

# Percentage of missing values per column, rounded to two decimals, limited
# to the columns that actually contain missing values.
null_mask = df.isnull()
missing_share = (100 * null_mask.sum() / len(df)).round(2)
missing_share = missing_share[null_mask.any()]

# Count the columns that fall into each bucket (lower edge inclusive).
under_10_percent = int((missing_share < 10).sum())
ten_to_20_percent = int(((missing_share >= 10) & (missing_share < 20)).sum())
twenty_to_50_percent = int(((missing_share >= 20) & (missing_share < 50)).sum())
over_50_percent = int((missing_share >= 50).sum())

print(f'ncols with under 10% missing data:\t{under_10_percent}')
print(f'ncols with 10-20% missing data:\t\t{ten_to_20_percent}')
print(f'ncols with 20-50% missing data:\t\t{twenty_to_50_percent}')
print(f'ncols with over 50% missing data:\t{over_50_percent}')

Many columns have missing data, but the majority of cols with NaNs have under 10% missing values. It's safe to fill all missing values by median.

In [None]:
# check if there are any inf values in data
# Count infinite entries of either sign in the numeric columns; the previous
# `df == np.inf` comparison only saw +inf.
np.isinf(df.select_dtypes(include='number')).to_numpy().sum()

In [None]:
# check for missing values in the target columns
train[targets_names].isnull().sum()

In [None]:
def impute_missing_by_median(df):
    """Fill missing and infinite values with each column's median, in place.

    ``+inf`` and ``-inf`` are first converted to NaN so they are imputed too.
    The number of missing values is printed before and after; the "before"
    figure is taken ahead of the inf conversion.

    Args:
        df: DataFrame to modify (mutated in place).

    Returns:
        The same ``df`` object.
    """
    print(f'Missing values before imputation: {int(df.isnull().sum().sum())}')
    # first replace inf and -inf with nan
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # then impute nan by median
    for col in df:
        if df[col].isnull().any():
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on df[col]: that form operates on an
            # intermediate object and does not update df under pandas
            # Copy-on-Write.
            df[col] = df[col].fillna(df[col].median())
    print(f'Missing values after imputation: {int(df.isnull().sum().sum())}')
    return df

In [None]:
df = impute_missing_by_median(df)

## Extract train & test without target cols and build a separate targets df with the 9 target cols

In [None]:
# Split the combined frame back into train and test via the is_train flag.
# Non-in-place drops are used because an in-place drop on a boolean-filtered
# slice raises SettingWithCopyWarning and leaves it unclear which object
# was modified.
is_train_row = df['is_train'].astype(bool)
train = df[is_train_row].drop(columns='is_train')
test = df[~is_train_row].drop(columns='is_train').reset_index(drop=True)

# Separate the target columns from the features; test keeps features only.
test = test.drop(columns=targets_names)
targets = train[targets_names].copy()
train = train.drop(columns=targets_names)

## Validate XGB model on train data
### Since we have 9 independent target columns, I will train 9 separate models on the train features, one to predict each target.

In [None]:
# Preview the first rows of the target columns instead of rendering the whole frame.
targets.head()

In [None]:
# A single regressor with default settings; the same instance is passed to the
# cross-validation loop and re-fit for each target in the training loop below.
model = XGBRegressor()

In [None]:
print('Cross validation R2 scores for each target:\n')

# Mean 10-fold R2 per target, rounded to two decimals, keyed by target name.
cross_val_score_results = {}
for target_name, target_values in targets.items():
    fold_scores = cvs(model, train, target_values, cv=10, scoring='r2')
    cross_val_score_results[target_name] = np.round(np.mean(fold_scores), 2)
    print(f'{target_name} -> {cross_val_score_results[target_name]}')

overall_r2 = np.mean(list(cross_val_score_results.values()))
print(f'\nMean R2 score across all targets: {overall_r2}')

Mixed results: the EBITDA R2 is negative, but the rest are not bad. Let's try to upload a first submission.

## Train the same model on the whole train set, predict on test, save to a dictionary

In [None]:
# Fit on the full training set once per target and keep the test-set
# predictions keyed by target name.
preds = {}
for target_name, target_values in targets.items():
    model.fit(train, target_values)
    preds[target_name] = model.predict(test)
    print(f'Finished train/predict for: {target_name}')

## Make submission

In [None]:
sub = pd.read_csv('/kaggle/input/financial-performance-prediction/sample_submission.csv')

In [None]:
# sanity check
# Sanity check: predictions are written positionally, so the submission rows
# must line up one-to-one with the test rows. Fail loudly on a mismatch
# instead of only displaying a boolean Series.
assert np.array_equal(sub['Id'].to_numpy(), test['Id'].to_numpy()), \
    'sample_submission Ids do not match the test set Ids row by row'

In [None]:
# Copy each target's predictions into the matching submission column.
for target_name, predicted in preds.items():
    sub[target_name] = predicted
sub.head()

In [None]:
sub.to_csv('submission.csv', index = False)

In [None]:
#ids = np.array(sub.Id)
#preds.keys()

In [None]:
# Write the submission file
#np.savetxt(
#    'submission.csv',
#    np.rec.fromarrays([ids] + [pred for pred in list(preds.values())]),
#    fmt=['%s', '%d', '%d', '%d', '%d', '%d', '%d', '%d', '%d', '%d'],
#    delimiter=',',
#    header='Id,Q0_TOTAL_ASSETS,Q0_TOTAL_LIABILITIES,Q0_TOTAL_STOCKHOLDERS_EQUITY,Q0_GROSS_PROFIT,Q0_COST_OF_REVENUES,Q0_REVENUES,Q0_OPERATING_INCOME,Q0_OPERATING_EXPENSES,Q0_EBITDA',
#    comments='submission description from code',
#)