In [30]:
import mysql.connector
import pandas as pd
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LogisticRegression

In [31]:
engine = create_engine('mysql+mysqlconnector://root:toor@localhost:3306/telco_churn')

In [32]:
tables = pd.read_sql_query("SHOW TABLES", engine)
tables.head()

Unnamed: 0,Tables_in_telco_churn
0,billing_table
1,customer_table
2,security_table
3,service_table
4,streaming_table


In [33]:
customer_data = pd.read_sql_query("SELECT * FROM customer_table", engine)
billing_data = pd.read_sql_query("SELECT * FROM billing_table", engine)
service_data = pd.read_sql_query("SELECT * FROM service_table", engine)
streaming_data = pd.read_sql_query("SELECT * FROM streaming_table", engine)
security_data = pd.read_sql_query("SELECT * FROM security_table", engine)

In [34]:
customer_billing= pd.merge(customer_data, billing_data, on='customerid')

In [35]:
customer_billing.sample(5)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,id,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
7590,1925-GMVBW,Female,0,No,No,1519,Month-to-month,No,Credit card (automatic),20.55,96.1,No
19378,2403-BCASL,Male,1,Yes,Yes,3876,One year,Yes,Electronic check,111.95,4534.9,Yes
32948,3118-UHVVQ,Female,0,Yes,No,6590,Two year,No,Credit card (automatic),25.8,1911.5,No
13729,4826-XTSOH,Male,1,Yes,No,2746,Month-to-month,Yes,Electronic check,86.05,86.05,Yes
26943,5515-IDEJJ,Male,0,Yes,Yes,5389,Month-to-month,No,Mailed check,19.9,19.9,Yes


In [36]:
customer_billing = customer_billing.drop(['customerid', 'id'], axis=1)

<!DOCTYPE html>
<html>
<head>
    <title>Column Value Transformation</title>
</head>
<body>
    <h1>Transforming Column Values</h1>
    <p>We are trying to change the 'Yes'/'No' values for various columns to either '1' or '0'. This operation is commonly performed in data preprocessing to convert categorical data into a format that can be used by machine learning algorithms.</p>
</body>
</html>

In [37]:
customer_billing.sample(5)

Unnamed: 0,gender,seniorcitizen,partner,dependents,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
19176,Female,0,Yes,Yes,One year,No,Mailed check,19.9,533.5,No
9778,Female,0,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),95.4,4613.95,No
23206,Female,0,No,No,Month-to-month,No,Mailed check,24.4,24.4,No
1420,Male,0,Yes,Yes,Month-to-month,No,Mailed check,95.15,1779.95,Yes
32851,Female,0,No,No,Month-to-month,Yes,Mailed check,29.15,110.05,No


In [39]:
customer_billing["churn"] = customer_billing["churn"].apply(lambda x: 1 if x == 'Yes' else 0)
customer_billing["paperlessbilling"] = customer_billing["paperlessbilling"].apply(lambda x: 1 if x == 'Yes' else 0)
customer_billing["partner"] = customer_billing["partner"].apply(lambda x: 1 if x == 'Yes' else 0)
customer_billing["dependents"] = customer_billing["dependents"].apply(lambda x: 1 if x == 'Yes' else 0)
customer_billing.sample(5)

Unnamed: 0,gender,seniorcitizen,partner,dependents,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
13410,Male,0,0,0,Month-to-month,0,Mailed check,50.4,206.6,0
26524,Male,1,1,0,Month-to-month,0,Bank transfer (automatic),70.45,849.1,0
12623,Male,0,0,1,Two year,0,Credit card (automatic),19.9,1529.65,0
27790,Female,0,1,1,Two year,1,Credit card (automatic),116.4,8543.25,0
15493,Male,1,1,0,Month-to-month,1,Mailed check,79.65,2365.15,0


<!DOCTYPE html>
<html>
<head>
    <title>Data Preprocessing dummies</title>
</head>
<body>
    <h1>Data Preprocessing for Machine Learning</h1>
    <p>In the current project, we are dealing with a dataset that contains categorical data. Categorical data is a type of data that can take on one of a limited number of categories. For example, in our dataset, the 'contract' and 'paymentmethod' columns contain various categories.</p>
    <p>Most machine learning algorithms require numerical input and output variables. So, we need to convert these categorical data into a numerical format. One common technique for this conversion is called one-hot encoding.</p>
    <p>In pandas, the <code>get_dummies</code> function is used to convert categorical variable(s) into dummy/indicator variables. For each unique value in the categorical column, it creates a new column that represents whether the record has that value. If the record has that value, it will be 1, otherwise it will be 0.</p>
    <p>Here's how we can apply this in our project:</p>
    <pre>
    <code>
    customer_billing = pd.get_dummies(customer_billing, columns=['contract', 'paymentmethod'], drop_first=True)
    </code>
    </pre>
    <p>The <code>drop_first=True</code> argument is used to avoid the dummy variable trap, which is a scenario in which the independent variables are multicollinear.</p>
</body>
</html>

In [40]:
customer_billing = pd.get_dummies(customer_billing, columns=['contract', 'paymentmethod'], drop_first=True)

In [42]:
customer_billing.sample(100)

Unnamed: 0,gender,seniorcitizen,partner,dependents,paperlessbilling,monthlycharges,totalcharges,churn,contract_One year,contract_Two year,paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
17499,Female,0,0,0,0,20.90,20.90,0,False,False,False,False,True
18726,Male,0,0,0,1,94.25,669.00,0,False,False,False,True,False
14850,Male,1,0,0,1,70.25,331.90,0,False,False,False,False,True
18198,Female,1,1,0,1,100.50,6029.00,0,True,False,True,False,False
29717,Male,0,0,0,1,84.85,84.85,0,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22734,Female,0,1,1,1,84.50,916.90,0,False,False,False,True,False
32594,Male,0,0,0,0,45.30,45.30,0,False,False,False,True,False
8230,Female,0,1,1,0,65.50,3801.30,0,True,False,False,False,True
18438,Male,0,1,1,0,100.00,1888.65,0,True,False,False,False,False
