### Import libraries

In [1]:
# Third-party data-analysis and plotting libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Scikit-learn imports for preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Fake data
from faker import Faker
# Shared Faker generator used by the cells below to invent categorical values.
fake = Faker()
# Seed Faker's shared random source so a fresh "Restart & Run All" produces the
# same fake industries, company sizes and regions every time.
Faker.seed(42)

### Create fake data

In [2]:
# Build a synthetic matrix of integer values: one row per company, one column
# per product.
SEED = 42
num_companies = 10000
num_products = 30

# Seed NumPy's global generator so the notebook is reproducible from a fresh
# kernel. Later cells that call np.random.* draw from this same seeded stream.
np.random.seed(SEED)

# Integers drawn uniformly from 0..50 (the upper bound 51 is exclusive).
data = np.random.randint(0, 51, size=(num_companies, num_products))

# Label rows Company1..CompanyN and columns product_1..product_M.
df = pd.DataFrame(data,
                  index=[f"Company{i+1}" for i in range(num_companies)],
                  columns=[f"product_{i+1}" for i in range(num_products)])

df.head()


Unnamed: 0,product_1,product_2,product_3,product_4,product_5,product_6,product_7,product_8,product_9,product_10,...,product_21,product_22,product_23,product_24,product_25,product_26,product_27,product_28,product_29,product_30
Company1,14,33,27,19,35,32,7,37,32,48,...,21,4,8,6,3,26,27,49,29,28
Company2,21,26,47,31,34,48,47,10,23,45,...,37,8,44,18,12,37,7,45,8,45
Company3,21,18,31,48,33,37,38,37,7,13,...,2,2,14,38,31,15,9,46,4,23
Company4,19,32,47,40,12,29,32,44,43,50,...,5,40,11,31,3,41,16,18,49,44
Company5,23,48,33,38,18,30,47,17,44,12,...,25,33,34,37,45,21,49,30,15,32


In [3]:
# Display the frame's dimensions as (rows, columns).
df.shape

(10000, 30)

#### Add categorical columns with Faker

In [4]:
# Generate 10 distinct fake "industry" labels, then assign one at random to
# each company. `fake.unique` makes Faker reject values it has already
# returned, so the list is guaranteed to hold 10 different labels
# (plain `fake.bs()` can repeat).
unique_industries = [fake.unique.bs() for _ in range(10)]
df['Industry'] = [fake.random_element(elements=unique_industries) for _ in range(num_companies)]

In [5]:
# Company size: one of three labels, picked independently for every row.
company_sizes = ('Small', 'Medium', 'Large')
df['Company Size'] = [
    fake.random_element(elements=company_sizes)
    for _ in range(num_companies)
]

# Region: a Faker-generated country name for every row.
df['Region'] = [
    fake.country()
    for _ in range(num_companies)
]

In [6]:
# Preview the first five rows, now including the three categorical columns.
df.head(n=5)

Unnamed: 0,product_1,product_2,product_3,product_4,product_5,product_6,product_7,product_8,product_9,product_10,...,product_24,product_25,product_26,product_27,product_28,product_29,product_30,Industry,Company Size,Region
Company1,14,33,27,19,35,32,7,37,32,48,...,6,3,26,27,49,29,28,syndicate best-of-breed schemas,Small,Tunisia
Company2,21,26,47,31,34,48,47,10,23,45,...,18,12,37,7,45,8,45,synergize turn-key functionalities,Large,Qatar
Company3,21,18,31,48,33,37,38,37,7,13,...,38,31,15,9,46,4,23,scale mission-critical initiatives,Medium,Haiti
Company4,19,32,47,40,12,29,32,44,43,50,...,31,3,41,16,18,49,44,syndicate best-of-breed schemas,Large,Mexico
Company5,23,48,33,38,18,30,47,17,44,12,...,37,45,21,49,30,15,32,synergize turn-key functionalities,Medium,Bouvet Island (Bouvetoya)


In [7]:
# Put the three descriptive columns first, followed by every other column in
# its existing order. Selecting them by name (instead of "whatever the last
# three columns are") keeps this cell idempotent: re-running it leaves the
# column order unchanged rather than rotating product columns to the front.
leading_cols = ['Industry', 'Company Size', 'Region']
remaining_cols = [col for col in df.columns if col not in leading_cols]
cols = leading_cols + remaining_cols  # Full column order, descriptive columns first
df = df[cols]  # Reassign the dataframe to have the new column order
df.head()

Unnamed: 0,Industry,Company Size,Region,product_1,product_2,product_3,product_4,product_5,product_6,product_7,...,product_21,product_22,product_23,product_24,product_25,product_26,product_27,product_28,product_29,product_30
Company1,syndicate best-of-breed schemas,Small,Tunisia,14,33,27,19,35,32,7,...,21,4,8,6,3,26,27,49,29,28
Company2,synergize turn-key functionalities,Large,Qatar,21,26,47,31,34,48,47,...,37,8,44,18,12,37,7,45,8,45
Company3,scale mission-critical initiatives,Medium,Haiti,21,18,31,48,33,37,38,...,2,2,14,38,31,15,9,46,4,23
Company4,syndicate best-of-breed schemas,Large,Mexico,19,32,47,40,12,29,32,...,5,40,11,31,3,41,16,18,49,44
Company5,synergize turn-key functionalities,Medium,Bouvet Island (Bouvetoya),23,48,33,38,18,30,47,...,25,33,34,37,45,21,49,30,15,32


#### Add target variable

In [8]:
# Random binary label (0 or 1) per company, stored as the target column.
# The upper bound of randint is exclusive, so high=2 yields only 0 and 1.
purchase_flags = np.random.randint(low=0, high=2, size=num_companies)
df['purchased_new_sku'] = purchase_flags
df.head()

Unnamed: 0,Industry,Company Size,Region,product_1,product_2,product_3,product_4,product_5,product_6,product_7,...,product_22,product_23,product_24,product_25,product_26,product_27,product_28,product_29,product_30,purchased_new_sku
Company1,syndicate best-of-breed schemas,Small,Tunisia,14,33,27,19,35,32,7,...,4,8,6,3,26,27,49,29,28,0
Company2,synergize turn-key functionalities,Large,Qatar,21,26,47,31,34,48,47,...,8,44,18,12,37,7,45,8,45,1
Company3,scale mission-critical initiatives,Medium,Haiti,21,18,31,48,33,37,38,...,2,14,38,31,15,9,46,4,23,1
Company4,syndicate best-of-breed schemas,Large,Mexico,19,32,47,40,12,29,32,...,40,11,31,3,41,16,18,49,44,1
Company5,synergize turn-key functionalities,Medium,Bouvet Island (Bouvetoya),23,48,33,38,18,30,47,...,33,34,37,45,21,49,30,15,32,0


### Create weights

In [9]:
# Per-product weights, listed in product order (one entry per product column).
final_weights = [2, 1, 5, 3, 10, 10, 5, 8, 9, 3, 5, 5, 5, 5, 5, 6, 10, 1, 1, 3, 2, 10, 9, 8, 7, 6, 5, 4, 3, 2]

# Fail fast with a descriptive error if the list is out of step with the
# product count. This is the only length check: a bare `assert` placed before
# it would fire first and make the ValueError branch unreachable.
if len(final_weights) != num_products:
    raise ValueError("Number of weights must match number of products")
else:
    print("Weights are correct")

Weights are correct


#### Calculate weighted score for each company

In [11]:
# Weighted score per company: the dot product of its product values with
# `final_weights`. The product columns are selected by name rather than by
# position (the former `iloc[:, 3:33]`), so the result does not depend on how
# many columns sit before or after them or on the cell execution order.
product_cols = [f"product_{i+1}" for i in range(num_products)]
df['weighted_score'] = df[product_cols].dot(final_weights)

In [12]:
# Min-max scaler that maps the fitted column onto the 0-100 range.
score_range = (0, 100)
scaler = MinMaxScaler(feature_range=score_range)

In [13]:
# Fit the scaler on the weighted scores and rescale them in one step.
# Double brackets keep the input two-dimensional (a one-column frame).
weighted_score_frame = df[['weighted_score']]
normalized_score = scaler.fit_transform(weighted_score_frame)

In [16]:
# Store the rescaled score, rounded to two decimals, as a new column.
rounded_score = np.round(normalized_score, 2)
df['normalized_score'] = rounded_score

In [17]:
# Preview the first five rows with the target and both score columns appended.
df.head(n=5)

Unnamed: 0,Industry,Company Size,Region,product_1,product_2,product_3,product_4,product_5,product_6,product_7,...,product_24,product_25,product_26,product_27,product_28,product_29,product_30,purchased_new_sku,weighted_score,normalized_score
Company1,syndicate best-of-breed schemas,Small,Tunisia,14,33,27,19,35,32,7,...,6,3,26,27,49,29,28,0,3447,34.38
Company2,synergize turn-key functionalities,Large,Qatar,21,26,47,31,34,48,47,...,18,12,37,7,45,8,45,1,4204,55.33
Company3,scale mission-critical initiatives,Medium,Haiti,21,18,31,48,33,37,38,...,38,31,15,9,46,4,23,1,3959,48.55
Company4,syndicate best-of-breed schemas,Large,Mexico,19,32,47,40,12,29,32,...,31,3,41,16,18,49,44,1,4685,68.63
Company5,synergize turn-key functionalities,Medium,Bouvet Island (Bouvetoya),23,48,33,38,18,30,47,...,37,45,21,49,30,15,32,0,4720,69.6


### Build a model

We skip exploratory data analysis (EDA) here because the data is synthetic and already clean, so it is ready for modelling.