In [15]:
#Importing Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
# Use seaborn's 'darkgrid' theme for every figure drawn after this point
sns.set_style('darkgrid')
import warnings
# NOTE(review): this silences ALL warnings for the whole notebook, including
# pandas/seaborn deprecation and chained-assignment warnings that point at real
# problems in later cells. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [16]:
#Load the training data; the 'timestamp' column is parsed into datetimes on read
train = pd.read_csv(
    'sbank/train.csv',
    parse_dates=['timestamp'],
)

In [17]:
#Preview the first 10 rows of the training data
train.head(n=10)

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452
5,6,2011-09-06,67,46.0,14.0,,,,,,...,62,14,1,53,78,1,20,113,17,9100000
6,7,2011-09-08,25,14.0,10.0,,,,,,...,81,16,3,38,80,1,27,127,8,5500000
7,8,2011-09-09,44,44.0,5.0,,,,,,...,9,4,0,11,18,1,0,47,4,2000000
8,9,2011-09-10,42,27.0,5.0,,,,,,...,19,8,1,18,34,1,3,85,11,5300000
9,10,2011-09-13,36,21.0,9.0,,,,,,...,19,13,0,10,20,1,3,67,1,2000000


In [18]:
#(number of rows, number of columns) of the training data
train.shape

(30471, 292)

### THE TARGET : 'Price_doc'

This is the variable to be predicted: the target variable. It is the price of each housing unit, in Russian rubles.

In [19]:
#Pull the target variable out of the training frame
target = train['price_doc']

In [20]:
#Target distribution: summary statistics (count, mean, std, min, quartiles, max)
target.describe()

count    3.047100e+04
mean     7.123035e+06
std      4.780111e+06
min      1.000000e+05
25%      4.740002e+06
50%      6.274411e+06
75%      8.300000e+06
max      1.111111e+08
Name: price_doc, dtype: float64

In [None]:
#Target distribution plot
plt.figure(figsize=(8,5))
#Pass the vector as `x=`: on seaborn >= 0.12 the first positional argument is
#`data`, which would change how the Series is interpreted and drawn
sns.boxplot(x=target, color='g')

<matplotlib.axes._subplots.AxesSubplot at 0x841ea10>

# The Timestamp Feature

- This feature carries the date on which each transaction of the housing units took place.


In [None]:
#Show the first 10 values of the timestamp feature
train['timestamp'].head(10)

In [None]:
#The distinct calendar years that appear in the timestamps
d_years = train['timestamp'].dt.year.unique()

#How many distinct calendar years there are
num_of_years = train['timestamp'].dt.year.nunique()

#How many distinct transaction dates there are
unique_dates = train['timestamp'].nunique()

#Report the three figures, one per line
print(
    f'This data is collected over --------------------------{num_of_years} years',
    f'These years are ------------------------------------ {d_years}',
    f'The number of unique dates in this dataset are-------{unique_dates} dates',
    sep='\n',
)

In [None]:
#Extract the calendar month of every transaction from the timestamp
month = train.timestamp.dt.month

#Median price per calendar month (one group per month, pooled over all years)
grouped_month = train.groupby(month)['price_doc'].median()

#Line plot of the monthly medians.
#`kind` must be a keyword: pandas >= 1.0 raises TypeError for Series.plot('line', ...)
grouped_month.plot(kind='line', figsize=(15,6))

#Label the axes and title the plot
plt.xlabel('Month of transaction')
plt.ylabel('Median Price (Rubbles)')
plt.title('Lineplot of monthly median housing price', fontweight ='bold')
plt.show()

In [None]:
#Extract the calendar year of every transaction from the timestamp
years = train.timestamp.dt.year

#Median price per calendar year (one group per year present in the data)
grouped_years = train.groupby(years)['price_doc'].median()

#Line plot of the yearly medians.
#`kind` must be a keyword: pandas >= 1.0 raises TypeError for Series.plot('line', ...)
grouped_years.plot(kind='line', figsize=(15,6))

#Label the axes and title the plot
plt.xlabel('Year of Transaction')
plt.ylabel('Median Price (Rubbles)')
plt.title('Lineplot of yearly Median housing price', fontweight = 'bold')
plt.show()

## THE HOUSING INTERIOR FEATURES



- These are features that describe the interior of the buildings 

In [None]:
#Selecting the features that describe the apartment (plus the target, for plotting)
interior_features = ['full_sq', 'life_sq', 'floor', 'max_floor', 
                    'material', 'build_year', 'num_room',
                    'state', 'product_type','price_doc']

#Creating a separate dataframe for these features for easy visualisations.
#.copy() makes it independent of `train`, so later corrections to `interior`
#neither write through to `train` nor act on an ambiguous view/copy.
interior = train[interior_features].copy()

### Bivariate/multivariate analysis between a feature or more and the target variable

Let's look at the meaning of each feature:

<code>- Full_sq feature</code>  - This represents the total square area(m) of the apartment and surroundings.  <br/> 
<code>- Life_sq feature</code>   - This represents the square area(m) of the living room only.
<br/> 
<code>- Material feature</code> - This represents the type of material used for interior wall design.
<br/> 
<code>- Max_floor feature</code> - This represents the maximum number of floors in an apartment
<br/> 
<code>- Num_room feature</code> - This represents the number of living rooms in the apartment
<br/> 
<code>- Build_year feature</code> - This represents the year each apartment was built.
<br/> 
<code>- Product_type feature</code> - This represents what the house is used for: Investment or Owner-Occupier
<br/> 
<code>- State feature</code> - This represents the condition of the apartment.
<br/> 


1. <code>- full_sq </code> and <code> life_sq </code> 

In [None]:
#'full_sq' contains extreme outliers which would make the plots unreadable.
#It also contains anomalies: it doesn't make sense for 'life_sq' to be greater than 'full_sq'.

#Drop only the anomalous rows, i.e. those where 'life_sq' exceeds 'full_sq'.
#Rows where the two are equal, or where either value is missing, are kept:
#the comparison is False for them, so they are not flagged as anomalies.
life_exceeds_full = interior['life_sq'] > interior['full_sq']
interior_clean = interior[~life_exceeds_full]

#Drop the rows whose 'full_sq' is 300 or more.
#This must filter `interior_clean`, not `interior`; starting again from
#`interior` would silently throw away the anomaly filter applied just above.
interior_clean = interior_clean[interior_clean['full_sq'] < 300]

In [None]:
#Median price for every distinct value of 'full_sq'
grouped_sq = interior_clean.groupby('full_sq')['price_doc'].median()

#Plotting a Line plot.
#`kind` must be a keyword: pandas >= 1.0 raises TypeError for Series.plot('line', ...)
grouped_sq.plot(kind='line', figsize=(15,5), colormap='rocket')
plt.xlabel('Total square Area(m)')
plt.ylabel('Median Price(Rubbles)')
plt.title('A Line plot of Total Area against median Housing Price', fontweight = 'bold')
plt.show()

2. <code>- Material </code>

In [None]:
#How often each category of the 'material' feature occurs
interior['material'].value_counts()

In [None]:
#Category '3' occurs only once, which would not do our visualisations any good

#Replace the '3' with the most frequent category, which is '1'.
#Build a new frame instead of calling replace(..., inplace=True) on
#`interior.material`: that chained in-place call acts on a temporary Series and
#leaves `interior` unchanged under pandas copy-on-write.
interior = interior.assign(material=interior['material'].replace(3, 1))

In [None]:
#Frequency of each material category
plt.figure(figsize=(10,5))
#Pass the column as `x=`: on seaborn >= 0.12 the first positional argument is
#`data`, so a bare Series is no longer read as the variable to count
sns.countplot(x=interior.material)
plt.xlabel('Materials')
plt.ylabel('Frequency')
plt.title('The frequency of materials ', fontweight = 'bold')
plt.show()

In [None]:
#Average price for each 'material' category
grouped_mat = interior_clean.groupby('material')['price_doc'].mean()

#Plotting a BarChart, highest average first.
#`kind` must be a keyword: pandas >= 1.0 raises TypeError for Series.plot('bar', ...)
grouped_mat.sort_values(ascending =  False).plot(kind='bar', figsize=(15,6))
plt.xlabel('Interior Materials')
plt.ylabel('Average Price(Rubbles)')
plt.title('A Bar Chart of Average price of apartments built with various materials',
          fontweight = 'bold')
plt.show()

In [None]:
plt.figure(figsize=(20,5))

#Point plot of 'material' against 'price_doc'.
#x and y are given by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.pointplot(x='material', y='price_doc', data=interior)
plt.xlabel('Materials')
plt.ylabel('Average Price(Rubbles)')
plt.title('A Point plot of Average price of apartments built with various materials',
          fontweight = 'bold')
plt.show()

3. <code>- state </code>

In [None]:
#How often each category of the 'state' feature occurs
interior['state'].value_counts()

In [None]:
#There is an obvious outlier: '33'. This might be an entry error

#Replace the outlier with a closer number.
#Build a new frame instead of calling replace(..., inplace=True) on
#`interior.state`: that chained in-place call acts on a temporary Series and
#leaves `interior` unchanged under pandas copy-on-write.
interior = interior.assign(state=interior['state'].replace(33, 3))

In [None]:
plt.figure(figsize=(10,5))

#Plotting the frequency of 'state'.
#Pass the column as `x=`: on seaborn >= 0.12 the first positional argument is
#`data`, so a bare Series is no longer read as the variable to count
sns.countplot(x=interior.state)
plt.xlabel('The state of the Apartment')
plt.ylabel('Frequency')
plt.title('The Frequency of the Apartment Condition', fontweight = 'bold')
plt.show()

In [None]:
#Median price per apartment 'state'.
#The '33' entry error is corrected to 3 on the fly, without mutating `interior`
#(the chained replace(..., inplace=True) used before is unreliable and not
#needed here), so the cell gives the same result however often it is run.
grouped = (
    interior
    .assign(state=interior['state'].replace(33, 3))
    .groupby('state')['price_doc']
    .median()
)


#Plotting Bar chart.
#`kind` must be a keyword: pandas >= 1.0 raises TypeError for Series.plot('bar', ...)
grouped.plot(kind='bar', figsize=(10,6), colormap='Accent_r')
plt.xlabel('State of the Apartment')
plt.ylabel('Median Prices of Apartments')
plt.title('The Median Prices of different Apartment conditions', fontweight = 'bold')
plt.show()

In [None]:
#How the wall materials are distributed within each apartment state
plt.figure(figsize=(10,5))
sns.countplot(
    x=interior['state'],
    hue=interior['material'],
)
plt.xlabel('The state of the Apartment')
plt.ylabel('Frequency')
plt.title(
    'The Frequency of Materials used with different Apartment Condition',
    fontweight='bold',
)
plt.show()

4. <code>- num_room </code>

In [None]:
#How often each value of 'num_room' occurs
interior['num_room'].value_counts()

In [None]:
# '0' living rooms doesn't make sense, and some of the larger values are too rare.
# Keep only rows with more than 0 and fewer than 6 rooms
# (rows with a missing 'num_room' fail the comparison and are dropped).
interior_clean = interior.query('0 < num_room < 6')

In [None]:
#Plotting the frequency of the number of rooms as a horizontal bar chart.
#`kind` must be a keyword: pandas >= 1.0 raises TypeError for Series.plot('barh')
interior_clean.num_room.value_counts().plot(kind='barh')
plt.ylabel('Number of Rooms')
plt.xlabel('Frequency')
plt.title('The Number of Rooms in Apartment', fontweight = 'bold')
plt.show()

In [None]:
#Median price for each number of rooms (the result is indexed by 'num_room', ascending)
grouped = interior_clean.groupby('num_room')['price_doc'].median()

#Plotting a line chart.
#`kind` must be a keyword: pandas >= 1.0 raises TypeError for Series.plot('line', ...).
#The series is left in index order: sorting by value before a line plot would
#join the points out of x order whenever the medians are not monotonic.
grouped.plot(kind='line', figsize=(15, 5), colormap='rainbow')
plt.xlabel('Number of Rooms')
plt.ylabel('Median price (Rubles)')
plt.title('The Plot of  the median price price of  the number of rooms in an apartment',fontweight = 'bold')
plt.show()

5. <code>- floor</code> and <code>max_floor</code>

In [None]:
#Count the rows whose 'floor' exceeds 'max_floor'; such rows are treated as noise
floor_noise = interior['floor'].gt(interior['max_floor']).sum()

print(f'The number of noise in the "floor" feature is {floor_noise}')

In [None]:
#Keep only rows whose 'floor' does not exceed 'max_floor'
#(rows with a missing value in either column fail the comparison and are dropped)
interior_clean = interior.query('floor <= max_floor')

In [None]:
#Average (mean) price for each value of 'floor'
grouped_floor = interior_clean.groupby('floor')['price_doc'].mean()

#Plotting a Bar chart, lowest average first.
#`kind` must be a keyword: pandas >= 1.0 raises TypeError for Series.plot('bar', ...)
grouped_floor.sort_values().plot(kind='bar', figsize=(15, 5), colormap='Greens_r')
#The labels now describe what is plotted (mean price per floor); they previously
#said "median" and "number of rooms", left over from the num_room cell.
plt.xlabel('Floor')
plt.ylabel('Average price (Rubles)')
plt.title('The Plot of the average price of apartments on each floor', fontweight = 'bold')
plt.show()

6. <code>- build_year</code> 

In [None]:
#Boxplot to check for outliers in 'build_year'.
#Pass the column as `x=`: on seaborn >= 0.12 the first positional argument is
#`data`, which would change how the Series is interpreted and drawn
sns.boxplot(x=interior.build_year)

In [None]:
#Drop implausible build years for clearer visualisation: keep only rows with
#'build_year' strictly between 1900 and 3000 (missing values are dropped too)
interior_clean = interior.query('1900 < build_year < 3000')

In [None]:
#Average (mean) price for each build year
grouped_year = interior_clean.groupby('build_year')['price_doc'].mean()

#Plotting a line chart.
#`kind` must be a keyword: pandas >= 1.0 raises TypeError for Series.plot('line', ...)
grouped_year.plot(kind='line', figsize=(15, 5), colormap='rainbow')
plt.xlabel('year apartments were built')
plt.ylabel('Average price (Rubles)')
#The title now says "average": the aggregate above is the mean, not the median
plt.title('A line plot of  the average price of years apartments were built', fontweight = 'bold')
plt.show()

In [None]:
#Visualizing relationships between 'build_year', 'material' and the price
plt.figure(figsize=(15,6))
#x, y and hue are given by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.scatterplot(x='build_year', y='price_doc', hue='material',
                data=interior_clean, palette='rainbow', legend='full')
plt.xlabel('Year apartment was built')
plt.ylabel('Price(Rubles)')
plt.title('The  Price of Apartment based on year built',fontweight = 'bold' )

plt.show()


In [None]:
#Visualizing relationships between 'build_year', 'state' and the price
plt.figure(figsize=(15,6))
#x, y and hue are given by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.scatterplot(x='build_year', y='price_doc', hue='state',
                data=interior_clean, palette='rainbow_r', legend='full')
#The labels now describe what is plotted: the x axis is the build year (not an
#age) and every point is an individual price (not an average).
plt.xlabel('Year apartment was built')
plt.ylabel('Price(Rubles)')
plt.title('The Price of Apartment based on year built and state', fontweight = 'bold')

plt.show()


7. <code>- Product_type feature </code>

In [None]:
#Create the figure that the pie chart is drawn on
plt.figure(figsize=(8,6))

#Pie chart of the share of each product type; the first slice is pulled out slightly
interior['product_type'].value_counts().plot.pie(
    explode=(0.1, 0.0),
    autopct='%1.1f%%',
)
plt.title('A Pie-Chart of the Product type', fontweight='bold')
plt.show()

In [None]:
#Setting the size of the plot
plt.figure(figsize=(8,6))

#A Barplot to show average price per product type.
#x and y are given by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x='product_type', y='price_doc', data=interior)
plt.xlabel('Product type')
plt.ylabel('Average Price (Rubles)')
plt.title('The Average price of Apartment based on Product type', fontweight = 'bold')
plt.show()


In [None]:
#Visualizing relationships between 'build_year', 'product_type' and the price
#Note: this switches the seaborn style to 'white' for all figures drawn from here on
sns.set_style('white')
plt.figure(figsize=(8,6))
#x, y and hue are given by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.scatterplot(x='build_year', y='price_doc', hue='product_type',
                data=interior_clean, palette='rainbow', legend='full')
#The labels now describe what is plotted: the x axis is the build year (not an
#age) and every point is an individual price (not an average).
plt.xlabel('Year apartment was built')
plt.ylabel('Price(Rubles)')
plt.title('The Price of Apartment based on year built and product type', fontweight = 'bold')

plt.show()


In [None]:
#Visualizing relationships between 'product_type' and 'floor'
plt.figure(figsize=(8,6))
#x and y are given by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x='product_type', y='floor', data=interior)
#The labels now describe what is plotted (average floor per product type); they
#previously referred to number of rooms and total square area.
plt.xlabel('Apartment type')
plt.ylabel('Average floor')
plt.title('The Average Floor of Apartment based on Type', fontweight = 'bold')
plt.show()


In [None]:
#Visualizing relationships between 'product_type' and 'state'
plt.figure(figsize=(8,6))
#x is given by keyword: seaborn >= 0.12 treats the first positional argument as
#`data`, which clashes with the explicit data= argument
sns.countplot(x='product_type', hue='state', data=interior)
plt.xlabel('Apartment type')
plt.ylabel('Frequency')
plt.title('The Distribution of state of Apartment based on Type', fontweight = 'bold')
plt.show()


In [None]:
#Visualizing relationships between 'material' and the 'product_type'
plt.figure(figsize=(8,6))
#x is given by keyword: seaborn >= 0.12 treats the first positional argument as
#`data`, which clashes with the explicit data= argument
sns.countplot(x='product_type', hue='material', data=interior)
#The labels now describe what is plotted (row counts per material); they
#previously referred to square area and apartment state.
plt.xlabel('Apartment type')
plt.ylabel('Frequency')
plt.title('The Distribution of materials of Apartment based on Type', fontweight = 'bold')
plt.show()
