In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


DATA_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

df = pd.read_csv(DATA_URL + "telecom_churn.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   3333 non-null   object 
 1   Account length          3333 non-null   int64  
 2   Area code               3333 non-null   int64  
 3   International plan      3333 non-null   object 
 4   Voice mail plan         3333 non-null   object 
 5   Number vmail messages   3333 non-null   int64  
 6   Total day minutes       3333 non-null   float64
 7   Total day calls         3333 non-null   int64  
 8   Total day charge        3333 non-null   float64
 9   Total eve minutes       3333 non-null   float64
 10  Total eve calls         3333 non-null   int64  
 11  Total eve charge        3333 non-null   float64
 12  Total night minutes     3333 non-null   float64
 13  Total night calls       3333 non-null   int64  
 14  Total night charge      3333 non-null   

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [2]:
print(df.shape)
print(f"Columns of data: {df.columns}")
columns_dict = {
    "int64": df.select_dtypes(include='int64').columns,
    "float64": df.select_dtypes(include='float64').columns,
    "object": df.select_dtypes(include='object').columns,
    "bool": df.select_dtypes(include='bool').columns
}

columns_dict

(3333, 20)
Columns of data: Index(['State', 'Account length', 'Area code', 'International plan',
       'Voice mail plan', 'Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls',
       'Churn'],
      dtype='object')


{'int64': Index(['Account length', 'Area code', 'Number vmail messages',
        'Total day calls', 'Total eve calls', 'Total night calls',
        'Total intl calls', 'Customer service calls'],
       dtype='object'),
 'float64': Index(['Total day minutes', 'Total day charge', 'Total eve minutes',
        'Total eve charge', 'Total night minutes', 'Total night charge',
        'Total intl minutes', 'Total intl charge'],
       dtype='object'),
 'object': Index(['State', 'International plan', 'Voice mail plan'], dtype='object'),
 'bool': Index(['Churn'], dtype='object')}

In [3]:
# lets convert datatype of `churn` column into int
df['Churn'] = df['Churn'].astype('int64')
df.sample(5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
220,WA,81,415,No,No,0,183.6,116,31.21,152.6,98,12.97,212.2,99,9.55,12.2,6,3.29,3,0
1805,CO,63,415,No,No,0,211.8,84,36.01,230.9,137,19.63,217.1,99,9.77,10.7,9,2.89,3,0
250,OR,33,415,No,Yes,29,157.4,99,26.76,117.9,80,10.02,279.2,79,12.56,13.9,11,3.75,4,1
3059,NV,116,510,No,Yes,35,118.0,103,20.06,167.2,106,14.21,205.7,102,9.26,11.8,2,3.19,2,0
1643,WI,107,408,No,No,0,134.0,104,22.78,174.5,94,14.83,311.1,79,14.0,7.3,3,1.97,3,0


In [4]:
df.dtypes

State                      object
Account length              int64
Area code                   int64
International plan         object
Voice mail plan            object
Number vmail messages       int64
Total day minutes         float64
Total day calls             int64
Total day charge          float64
Total eve minutes         float64
Total eve calls             int64
Total eve charge          float64
Total night minutes       float64
Total night calls           int64
Total night charge        float64
Total intl minutes        float64
Total intl calls            int64
Total intl charge         float64
Customer service calls      int64
Churn                       int64
dtype: object

In [5]:
df["Churn"].value_counts(normalize=True, sort=True)

0    0.855086
1    0.144914
Name: Churn, dtype: float64

## Sorting
A `DataFrame` can be sorted by the value of one of the variables (i.e columns). For example, we can sort by Total day charge (use `ascending=False` to sort in descending order):

In [6]:
df.sort_values(by='Total day charge', ascending=False).sample(5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
1072,MO,164,408,No,Yes,25,219.1,88,37.25,151.5,99,12.88,50.1,60,2.25,14.3,6,3.86,1,0
2086,WY,146,408,No,No,0,109.0,69,18.53,265.8,98,22.59,228.3,80,10.27,12.6,2,3.4,1,0
1843,NV,105,408,No,No,0,166.1,93,28.24,175.9,106,14.95,243.5,55,10.96,16.2,3,4.37,2,0
1180,SC,72,415,No,No,0,207.8,92,35.33,195.7,110,16.63,184.8,124,8.32,13.1,4,3.54,0,0
1742,HI,79,415,No,No,0,41.9,124,7.12,211.0,95,17.94,237.9,55,10.71,11.4,5,3.08,1,0


In [7]:
df.sort_values(by=["Churn", "Total day charge"], ascending=[True, False]).sample(5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
2912,WY,151,415,No,No,0,170.2,89,28.93,187.5,83,15.94,119.5,100,5.38,4.3,3,1.16,0,0
2934,DC,24,408,No,No,0,149.0,73,25.33,131.0,81,11.14,238.6,69,10.74,8.6,3,2.32,2,1
1466,UT,195,415,No,No,0,63.2,108,10.74,220.2,88,18.72,184.0,99,8.28,5.1,3,1.38,0,0
1348,VT,59,408,No,No,0,151.8,98,25.81,209.9,92,17.84,266.9,86,12.01,11.9,5,3.21,1,0
2396,WY,127,510,Yes,No,0,247.5,99,42.08,108.5,118,9.22,232.0,72,10.44,10.6,3,2.86,2,0


**Boolean** indexing with one column is also very convenient. The syntax is `df[P(df['Name'])]`, where `P` is some logical condition that is checked for each element of the `Name` column. The result of such indexing is the DataFrame consisting only of the rows that satisfy the `P` condition on the `Name` column.

Let’s use it to answer the question:

What are the average values of numerical features for churned users?

Here we’l resort to an additional method `select_dtypes` to select all numeric columns.

In [8]:
df.select_dtypes(include=np.number)[df["Churn"] == 1].mean()

Account length            102.664596
Area code                 437.817805
Number vmail messages       5.115942
Total day minutes         206.914079
Total day calls           101.335404
Total day charge           35.175921
Total eve minutes         212.410145
Total eve calls           100.561077
Total eve charge           18.054969
Total night minutes       205.231677
Total night calls         100.399586
Total night charge          9.235528
Total intl minutes         10.700000
Total intl calls            4.163561
Total intl charge           2.889545
Customer service calls      2.229814
Churn                       1.000000
dtype: float64

In [9]:
df[df["Churn"] == 1]["Total day minutes"].mean()

206.91407867494814

In [10]:
df[(df["Churn"] == 0) & (df["International plan"] == "No")]["Total intl minutes"].max()

18.9

In [11]:
df.loc[0:5, "State":"Area code"]

Unnamed: 0,State,Account length,Area code
0,KS,128,415
1,OH,107,415
2,NJ,137,415
3,OH,84,408
4,OK,75,415
5,AL,118,510


In [12]:
df.iloc[0:6, 0:3]

Unnamed: 0,State,Account length,Area code
0,KS,128,415
1,OH,107,415
2,NJ,137,415
3,OH,84,408
4,OK,75,415
5,AL,118,510


In [13]:
df.iloc[-1:,:]

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
3332,TN,74,415,No,Yes,25,234.4,113,39.85,265.9,82,22.6,241.4,77,10.86,13.7,4,3.7,0,0


## Applying Functions to Cells, Columns and Rows
To apply functions to each columns, use `apply()`

In [14]:
df.apply(np.max)

State                        WY
Account length              243
Area code                   510
International plan          Yes
Voice mail plan             Yes
Number vmail messages        51
Total day minutes         350.8
Total day calls             165
Total day charge          59.64
Total eve minutes         363.7
Total eve calls             170
Total eve charge          30.91
Total night minutes       395.0
Total night calls           175
Total night charge        17.77
Total intl minutes         20.0
Total intl calls             20
Total intl charge           5.4
Customer service calls        9
Churn                         1
dtype: object

In [15]:
df[df['State'].apply(lambda state: state[0] == "W")].head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
9,WV,141,415,Yes,Yes,37,258.6,84,43.96,222.0,111,18.87,326.4,97,14.69,11.2,5,3.02,0,0
26,WY,57,408,No,Yes,39,213.0,115,36.21,191.1,112,16.24,182.7,115,8.22,9.5,3,2.57,0,0
44,WI,64,510,No,No,0,154.0,67,26.18,225.8,118,19.19,265.3,86,11.94,3.5,3,0.95,1,0
49,WY,97,415,No,Yes,24,133.2,135,22.64,217.2,58,18.46,70.6,79,3.18,11.0,3,2.97,1,0
54,WY,87,415,No,No,0,151.0,83,25.67,219.7,116,18.67,203.9,127,9.18,9.7,3,2.62,5,1


In [16]:
d = {'N0':False, 'Yes':True}
df['International plan'] = df['International plan'].map(d)

## Grouping
In general, grouping data in Pandas works as follows:
```
df.groupby(by=grouping_columns)[columns_to_show].function()
```
First, the `groupby` method divides the `grouping_columns` by their values. They become a new index in the resulting dataframe.

Then, columns of interest are selected (`columns_to_show`). If `columns_to_show` is not included, all non groupby clauses will be included.

Finally, one or several functions are applied to the obtained groups per selected columns.

Here is an example where we group the data according to the values of the `Churn` variable and display statistics of three columns in each group:

In [20]:
colums_to_show = ["Total day minutes", "Total eve minutes", "Total night minutes"]
df.groupby(['Churn'])[colums_to_show].median()

Unnamed: 0_level_0,Total day minutes,Total eve minutes,Total night minutes
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,177.2,199.6,200.25
1,217.6,211.3,204.8


In [21]:
df.groupby(['Churn'])[colums_to_show].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0_level_0,Total day minutes,Total day minutes,Total day minutes,Total day minutes,Total eve minutes,Total eve minutes,Total eve minutes,Total eve minutes,Total night minutes,Total night minutes,Total night minutes,Total night minutes
Unnamed: 0_level_1,mean,std,min,max,mean,std,min,max,mean,std,min,max
Churn,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,175.175754,50.181655,0.0,315.6,199.043298,50.292175,0.0,361.8,200.133193,51.105032,23.2,395.0
1,206.914079,68.997792,0.0,350.8,212.410145,51.72891,70.9,363.7,205.231677,47.132825,47.4,354.9


## Summary tables
Suppose we want to see how the observations in our dataset are distributed in the context of two variables – `Churn` and `International plan`. To do so, we can build a contingency table using the `crosstab` method:

In [22]:
pd.crosstab(df['Churn'], df['International plan'])

International plan,True
Churn,Unnamed: 1_level_1
0,186
1,137


In [23]:
pd.crosstab(df['Churn'], df['Voice mail plan'], normalize=True)

Voice mail plan,No,Yes
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.60246,0.252625
1,0.120912,0.024002


In [24]:
df.pivot_table(['Total day calls', 'Total eve calls', 'Total night calls'],
               ['Area code'],
               aggfunc='mean')

Unnamed: 0_level_0,Total day calls,Total eve calls,Total night calls
Area code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
408,100.49642,99.788783,99.039379
415,100.576435,100.503927,100.398187
510,100.097619,99.671429,100.60119
