In [1]:
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
from sklearn import preprocessing

# Toy training matrix: three samples (rows) by three features (columns).
X_train = np.array([
    [1., -1.,  2.],
    [2.,  0.,  0.],
    [0.,  1., -1.],
])

# Build the scaler first, then fit it on the training data as a separate step.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
scaler

In [9]:
# Per-feature mean of X_train, stored on the scaler by fit().
scaler.mean_


array([1.        , 0.        , 0.33333333])

In [11]:
# Per-feature scaling factor (the standard deviation of each X_train column).
scaler.scale_

array([0.81649658, 0.81649658, 1.24721913])

In [13]:
# Standardize the training data with the already-fitted scaler and display it.
X_scaled = scaler.transform(X_train)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [15]:
# Sanity check: column-wise mean and standard deviation of the standardized data.
scaled_col_means = X_scaled.mean(axis=0)
scaled_col_stds = X_scaled.std(axis=0)
print("mean:", scaled_col_means, ", std:", scaled_col_stds)

mean: [0. 0. 0.] , std: [1. 1. 1.]


In [17]:
# Toy training matrix: three samples (rows) by three features (columns).
# The literal is written as plain Python. Doctest-style "..." continuation
# prompts inside it would only run because IPython strips them; in a plain
# Python interpreter they would be parsed as a subscript on Ellipsis and fail.
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

# Fit the min-max scaler on the training data and rescale that same data.
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [19]:
# Apply the scaling fitted on the training data to a new sample. Values that
# fall outside the training min/max end up outside [0, 1] in the result.
X_test = np.array([[-3., -1.,  4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [21]:
# Per-feature multiplier the fitted scaler applies to the data.
min_max_scaler.scale_

array([0.5       , 0.5       , 0.33333333])

In [23]:
 # Per-feature offset the fitted scaler adds after multiplying by scale_.
 # NOTE(review): this cell's code is indented by one space, presumably by
 # accident; IPython tolerates it, plain Python would not. Confirm and dedent.
 min_max_scaler.min_

array([0.        , 0.5       , 0.33333333])

In [25]:
# Toy training matrix: three samples (rows) by three features (columns).
X_train = np.array([
    [1., -1.,  2.],
    [2.,  0.,  0.],
    [0.,  1., -1.],
])

# Fit the max-abs scaler on the training data, then apply it to that same data.
max_abs_scaler = preprocessing.MaxAbsScaler()
max_abs_scaler.fit(X_train)
X_train_maxabs = max_abs_scaler.transform(X_train)
X_train_maxabs

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

In [27]:
# Apply the scaling fitted on the training data to a new sample. Values larger
# in magnitude than anything seen in training end up outside [-1, 1].
X_test = np.array([[ -3., -1.,  4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)
X_test_maxabs

array([[-1.5, -1. ,  2. ]])

In [29]:
# Per-feature divisor: the largest absolute value in each X_train column.
max_abs_scaler.scale_

array([2., 1., 2.])

In [31]:
# Three samples (rows); each one is rescaled independently of the others.
X = [
    [1., -1.,  2.],
    [2.,  0.,  0.],
    [0.,  1., -1.],
]

# Scale every row to unit L2 (Euclidean) norm and display the result.
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [39]:
# One row per sample; the columns hold gender, region and browser categories.
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
    ['female', 'from Africa', 'uses Chrome'],
]

# Learn an integer code for every category found in each column.
enc = preprocessing.OrdinalEncoder()
enc.fit(X)

In [45]:
# Encode a new row built from categories that were all present when fitting.
enc.transform([['female', 'from US', 'uses Firefox']])

array([[0., 2., 1.]])

In [47]:
# Encode another combination of categories that were present when fitting.
enc.transform([['male', 'from Europe', 'uses Chrome']])

array([[1., 1., 0.]])

In [49]:
# A single categorical column that contains one missing value.
X = [['male'], ['female'], [np.nan], ['female']]

# With default settings the missing entry stays NaN in the encoded output.
enc = preprocessing.OrdinalEncoder()
enc.fit_transform(X)

array([[ 1.],
       [ 0.],
       [nan],
       [ 0.]])

In [51]:
# A single categorical column that contains one missing value.
X = [['male'], ['female'], [np.nan], ['female']]

# Ask the encoder to emit -1 for missing entries instead of leaving them as NaN.
enc = preprocessing.OrdinalEncoder(encoded_missing_value=-1)
enc.fit_transform(X)

array([[ 1.],
       [ 0.],
       [-1.],
       [ 0.]])

In [55]:
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# Load the Telco customer-churn dataset from the CSV file in the working directory.
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Only the `preprocessing` module is imported, so the scaler classes are reached
# through it; a bare `StandardScaler()` raises NameError on a fresh kernel.
scaler = preprocessing.StandardScaler()

# Standardize the numeric columns one at a time. Each fit_transform call refits
# the scaler on the column it is given.
column_values = ["tenure"]
df[column_values] = scaler.fit_transform(df[column_values])

column_values2 = ["MonthlyCharges"]
df[column_values2] = scaler.fit_transform(df[column_values2])

column_values3 = ["SeniorCitizen"]
df[column_values3] = scaler.fit_transform(df[column_values3])

# The previous step works for tenure and MonthlyCharges, but TotalCharges has
# some empty entries, so it needs extra handling:
#   1. convert the strings to numbers; entries that cannot be parsed become NaN;
#   2. replace the NaN values with the column median;
#   3. scale the imputed column, here with a RobustScaler rather than the
#      StandardScaler used for the other columns.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
imputer = SimpleImputer(strategy="median")
imputed_values = imputer.fit_transform(df[["TotalCharges"]])
scaler = preprocessing.RobustScaler()
# fit_transform returns a single-column 2-D array; flatten it so it is assigned
# to the column as a plain 1-D sequence of values.
df["TotalCharges"] = scaler.fit_transform(imputed_values).ravel()

# Replace every categorical column (including the Churn label) with ordinal
# integer codes.
enc = preprocessing.OrdinalEncoder()
X = ["gender", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod", "Churn"]
df[X] = enc.fit_transform(df[X])

df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0.0,-0.439916,1.0,0.0,-1.277445,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.160323,-0.404100,0.0
1,5575-GNVDE,1.0,-0.439916,0.0,0.0,0.066327,1.0,0.0,0.0,2.0,...,2.0,0.0,0.0,0.0,1.0,0.0,3.0,-0.259629,0.145381,0.0
2,3668-QPYBK,1.0,-0.439916,0.0,0.0,-1.236724,1.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3.0,-0.362660,-0.380964,1.0
3,7795-CFOCW,1.0,-0.439916,0.0,0.0,0.514251,0.0,1.0,0.0,2.0,...,2.0,2.0,0.0,0.0,1.0,0.0,0.0,-0.746535,0.130977,0.0
4,9237-HQITU,0.0,-0.439916,0.0,0.0,-1.236724,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.197365,-0.368111,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,1.0,-0.439916,1.0,1.0,-0.340876,1.0,2.0,0.0,2.0,...,2.0,2.0,2.0,2.0,1.0,1.0,3.0,0.665992,0.175224,0.0
7039,2234-XADUH,0.0,-0.439916,1.0,1.0,1.613701,1.0,2.0,1.0,0.0,...,2.0,0.0,2.0,2.0,1.0,1.0,1.0,1.277533,1.762637,0.0
7040,4801-JZAZL,0.0,-0.439916,1.0,1.0,-0.870241,0.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.168632,-0.310552,0.0
7041,8361-LTMKD,1.0,2.273159,1.0,0.0,-1.155283,1.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.320338,-0.322327,1.0
