In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from scipy import stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from acquire import get_telco_data
#from prepare import prep_titanic
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Acquiring data: load the Telco customer table through the project's
# acquire module and preview the first rows.
df = get_telco_data()
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,payment_type_id,monthly_charges,total_charges,churn,contract_type_id,contract_type,internet_service_type_id.1,internet_service_type,payment_type_id.1,payment_type
0,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,2,59.9,542.4,No,1,Month-to-month,1,DSL,2,Mailed check
1,0013-MHZWF,Female,0,No,Yes,9,Yes,No,1,No,...,4,69.4,571.45,No,1,Month-to-month,1,DSL,4,Credit card (automatic)
2,0015-UOCOJ,Female,1,No,No,7,Yes,No,1,Yes,...,1,48.2,340.35,No,1,Month-to-month,1,DSL,1,Electronic check
3,0023-HGHWL,Male,1,No,No,1,No,No phone service,1,No,...,1,25.1,25.1,Yes,1,Month-to-month,1,DSL,1,Electronic check
4,0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,1,Yes,...,3,30.5,30.5,Yes,1,Month-to-month,1,DSL,3,Bank transfer (automatic)


In [None]:
# We have 7043 rows and 27 columns of data
df.shape

In [None]:
 # Keep only the first occurrence of each column label.
 # NOTE(review): columns such as "internet_service_type_id.1" carry a distinct
 # label, so this mask would not remove them - confirm they are handled.
 df = df.loc[:,~df.columns.duplicated()]

In [None]:
# We have no nulls. Data types: 1 float, 5 int, and 18 object across 24 columns
df.info()

In [None]:
# Looking at summary statistics, transposed so each column becomes a row
df.describe().T

In [None]:
# Sorting by tenure, longest first, and showing the top rows.
# Observed: max tenure is 72 months and min is 0.
df.sort_values('tenure', ascending=False).head()

In [None]:
# Dropping fully duplicated rows if they exist. The result is reassigned
# rather than applied with inplace=True, so the step is an ordinary
# expression that can be chained or re-run.
df = df.drop_duplicates()
df.shape

In [None]:
# Rename gender to "male" and encode it as a binary flag: 1 = Male, 0 = Female.
df = df.rename(columns={'gender': 'male'})
# Both labels are mapped in one pass and the result is cast explicitly:
# pandas deprecates the implicit object -> int downcast inside replace(),
# so without the cast the column may stay object-dtype. The cast raises
# if any label other than Male/Female is present.
df['male'] = df['male'].replace({"Male": 1, "Female": 0}).astype("int64")
df.male.value_counts()

In [None]:
# Rename partner to "partners" and encode it as 1 = Yes, 0 = No.
df = df.rename(columns={'partner': 'partners'})
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any label other than Yes/No.
df['partners'] = df['partners'].replace({"Yes": 1, "No": 0}).astype("int64")
df.partners.value_counts()

In [None]:
# Encode dependents as 1 = Yes, 0 = No.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any label other than Yes/No.
df['dependents'] = df['dependents'].replace({"Yes": 1, "No": 0}).astype("int64")
df.dependents.value_counts()

In [None]:
# Encode phone_service as 1 = Yes, 0 = No.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any label other than Yes/No.
df['phone_service'] = df['phone_service'].replace({"Yes": 1, "No": 0}).astype("int64")
df.head()

In [None]:
# Encode multiple_lines as 1 = Yes, 0 = No. "No phone service" counts as No,
# so it also maps to 0.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any unexpected label.
df["multiple_lines"] = (
    df["multiple_lines"]
    .replace({"No phone service": 0, "Yes": 1, "No": 0})
    .astype("int64")
)
df.multiple_lines.value_counts()

In [None]:
# Encode online_security as 1 = Yes, 0 = No. "No internet service" counts as
# No, so it also maps to 0.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any unexpected label.
df["online_security"] = (
    df["online_security"]
    .replace({"No internet service": 0, "Yes": 1, "No": 0})
    .astype("int64")
)
df.online_security.value_counts()

In [None]:
# Encode online_backup as 1 = Yes, 0 = No. "No internet service" counts as
# No, so it also maps to 0.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any unexpected label.
df["online_backup"] = (
    df["online_backup"]
    .replace({"No internet service": 0, "Yes": 1, "No": 0})
    .astype("int64")
)

In [None]:
# Encode device_protection as 1 = Yes, 0 = No. "No internet service" counts
# as No, so it also maps to 0.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any unexpected label.
df["device_protection"] = (
    df["device_protection"]
    .replace({"No internet service": 0, "Yes": 1, "No": 0})
    .astype("int64")
)

In [None]:
# Encode tech_support as 1 = Yes, 0 = No. "No internet service" counts as
# No, so it also maps to 0.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any unexpected label.
df["tech_support"] = (
    df["tech_support"]
    .replace({"No internet service": 0, "Yes": 1, "No": 0})
    .astype("int64")
)

In [None]:
# Encode streaming_tv as 1 = Yes, 0 = No. "No internet service" counts as
# No, so it also maps to 0.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any unexpected label.
df["streaming_tv"] = (
    df["streaming_tv"]
    .replace({"No internet service": 0, "Yes": 1, "No": 0})
    .astype("int64")
)

In [None]:
# Encode streaming_movies as 1 = Yes, 0 = No. "No internet service" counts
# as No, so it also maps to 0.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any unexpected label.
df["streaming_movies"] = (
    df["streaming_movies"]
    .replace({"No internet service": 0, "Yes": 1, "No": 0})
    .astype("int64")
)

In [None]:
# Preview the first rows transposed (one line per column) to review the recoded columns
df.head().T

In [None]:
# Inspect both ends of monthly_charges: the five lowest, then the five highest.
monthly_sorted = df.monthly_charges.sort_values()
print(monthly_sorted.head())
print(monthly_sorted.tail())

In [None]:
# Inspect both ends of total_charges: the 15 lowest, then the five highest.
# Observed: the last value shown is 999.9, and 11 entries have no total charge;
# those need a decision.
# NOTE(review): if total_charges is still stored as strings at this point, the
# sort is lexicographic, so 999.9 is probably not the numeric maximum - confirm.
total_sorted = df.total_charges.sort_values()
print(total_sorted.head(15))
print(total_sorted.tail())

In [None]:
# Rows with tenure == 0 get a total charge of 0 so they can be retained;
# every other row keeps its existing value. The column held string values
# (a plain float conversion was failing before the fill), so it is cast to
# float once the zeros are in place.
# Bracket assignment is used instead of attribute assignment, and the stray
# `.min()` expression whose result was never shown has been dropped.
df["total_charges"] = df["total_charges"].where(df["tenure"] != 0, 0).astype(float)

# Show the 15 smallest and 15 largest values to confirm the numeric ordering.
total_sorted = df["total_charges"].sort_values()
print(total_sorted.head(15))
print(total_sorted.tail(15))

In [None]:
# Encode churn as 1 = Yes, 0 = No.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any label other than Yes/No.
df["churn"] = df["churn"].replace({"Yes": 1, "No": 0}).astype("int64")
df.churn.value_counts()

In [None]:
# Drop the contract_type label column. This cell does not rename anything,
# so contract_type_id keeps its name and carries the numeric code:
# 1 = Month-to-Month, 2 = 1 yr, 3 = 2 yr.
df = df.drop(columns="contract_type")
# Keep only the first occurrence of any repeated column label.
first_occurrence = ~df.columns.duplicated()
df = df.loc[:, first_occurrence]

In [None]:
# NOTE(review): the rename of contract_type_id -> contract_type below is
# disabled, so the column keeps the name contract_type_id.
#df = df.rename(columns={'contract_type_id':'contract_type'})
# Show the first row transposed (one line per column).
df.head(1).T

In [None]:
# Looking at the value counts of the internet_service_type labels
df.internet_service_type.value_counts()

In [None]:
# Dropping the internet_service_type label column.
# A later cell renames internet_service_type_id to internet_service_type, so
# re-running an unguarded drop afterwards would silently delete the renamed
# ID column. Only drop while the un-renamed ID column is still present.
if "internet_service_type_id" in df.columns:
    df = df.drop(columns="internet_service_type")

In [None]:
# Renaming internet_service_type_id to internet_service_type.
# Codes: 1 = DSL, 2 = Fiber optic, 3 = None
df = df.rename(columns={'internet_service_type_id':'internet_service_type'})

In [None]:
# Re-check column dtypes and non-null counts after the changes so far
df.info()

In [None]:
# Going to add other_services column. It will contain online_security = 1, online_backup = 2, 
# device_protection = 3, tech_support = 4, streaming_tv = 5, streaming_movies = 6
#other_services = pd.DataFrame({"col1": ["online_security", "online_backup", "device_protection", "tech_support", "streaming_tv", "streaming_movies"], "other_services": [1,2,3,4,5,6]})
#other_services

In [None]:
# Concatenating both dataframes
#df = pd.concat([df, other_services], axis=1)
#df.head()

In [None]:
# Dropping online_security, online_backup, device_protection, tech_support, streaming_tv, streaming_movies, col1
#df = df.drop(["online_security", "online_backup", "device_protection", "tech_support", "streaming_tv", "streaming_movies", "col1"], axis=1)
#df.head()

In [None]:
# Encode paperless_billing as 1 = Yes, 0 = No.
# Explicit cast: replace() is not guaranteed to downcast object columns to
# int in newer pandas. The cast raises on any label other than Yes/No.
df["paperless_billing"] = df["paperless_billing"].replace({"Yes": 1, "No": 0}).astype("int64")
df.head(1)

In [None]:
# Distribution of paperless_billing. Title and axis labels are set so the
# figure stands on its own; the trailing ';' hides the Axes repr.
ax = df["paperless_billing"].hist()
ax.set(title="Paperless billing", xlabel="paperless_billing", ylabel="Customers");

In [None]:
# Show the first row of the current frame
df.head(1)

In [None]:
# payment_type holds text labels, so a bar chart of the counts per label is
# used rather than a histogram, which bins along a numeric axis.
# The trailing ';' hides the Axes repr.
ax = df["payment_type"].value_counts().plot.bar()
ax.set(title="Customers by payment type", xlabel="payment_type", ylabel="Customers");

In [None]:
# Distribution of dependents. Title and axis labels are set so the figure
# stands on its own; the trailing ';' hides the Axes repr.
ax = df["dependents"].hist()
ax.set(title="Dependents", xlabel="dependents", ylabel="Customers");

In [None]:
# Shell escape: show the git working-tree status
!git status

In [None]:
#!git add -A 

In [None]:
#!git commit -m "All complete, look at issues with internet_service_type"

In [None]:
#!git push