In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspark --quiet

[K     |████████████████████████████████| 281.4 MB 46 kB/s 
[K     |████████████████████████████████| 199 kB 54.2 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


### <b>1. Obtaining the Data</b>
This stage refers to collecting data.

In [102]:
import pyspark
import numpy as np
import random
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from pyspark import SparkConf
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [103]:
train_path="/content/drive/MyDrive/Freelance/Credit score prediction/train.csv"
test_path="/content/drive/MyDrive/Freelance/Credit score prediction/test.csv"

In [104]:
# lOADING THE DATASET
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","
df = spark.read.load(train_path,format='csv',header='true', inferSchema='true')
type(df)

pyspark.sql.dataframe.DataFrame

In [105]:
data_test=spark.read.load(test_path,format='csv',header='true', inferSchema='true')

In [106]:
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SSN: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Annual_Income: string (nullable = true)
 |-- Monthly_Inhand_Salary: double (nullable = true)
 |-- Num_Bank_Accounts: integer (nullable = true)
 |-- Num_Credit_Card: integer (nullable = true)
 |-- Interest_Rate: integer (nullable = true)
 |-- Num_of_Loan: string (nullable = true)
 |-- Type_of_Loan: string (nullable = true)
 |-- Delay_from_due_date: integer (nullable = true)
 |-- Num_of_Delayed_Payment: string (nullable = true)
 |-- Changed_Credit_Limit: string (nullable = true)
 |-- Num_Credit_Inquiries: double (nullable = true)
 |-- Credit_Mix: string (nullable = true)
 |-- Outstanding_Debt: string (nullable = true)
 |-- Credit_Utilization_Ratio: double (nullable = true)
 |-- Credit_History_Age: string (nullable = true

In [107]:
df.head()

Row(ID='0x1602', Customer_ID='CUS_0xd40', Month='January', Name='Aaron Maashoh', Age='23', SSN='821-00-0265', Occupation='Scientist', Annual_Income='19114.12', Monthly_Inhand_Salary=1824.8433333333328, Num_Bank_Accounts=3, Num_Credit_Card=4, Interest_Rate=3, Num_of_Loan='4', Type_of_Loan='Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan', Delay_from_due_date=3, Num_of_Delayed_Payment='7', Changed_Credit_Limit='11.27', Num_Credit_Inquiries=4.0, Credit_Mix='_', Outstanding_Debt='809.98', Credit_Utilization_Ratio=26.822619623699016, Credit_History_Age='22 Years and 1 Months', Payment_of_Min_Amount='No', Total_EMI_per_month=49.57494921489417, Amount_invested_monthly='80.41529543900253', Payment_Behaviour='High_spent_Small_value_payments', Monthly_Balance='312.49408867943663', Credit_Score='Good')

In [108]:
# inorder to get the functionality of data.shape as we used in pandas we can do in this way over here
print('Without any conversion: ',(df.count(), len(df.columns)))
# or if the dataset is small we can convert this into pandas dataframe and then use the functionallity of pandas
# we can observe that both of them will return same value
import pandas as pd    
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
pandasDF=df.toPandas()
print('By converting from spark to Pandas: ',pandasDF.shape)

Without any conversion:  (100000, 28)
By converting from spark to Pandas:  (100000, 28)


In [109]:
df.columns

['ID',
 'Customer_ID',
 'Month',
 'Name',
 'Age',
 'SSN',
 'Occupation',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Type_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Payment_of_Min_Amount',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance',
 'Credit_Score']

In [110]:
df.dtypes

[('ID', 'string'),
 ('Customer_ID', 'string'),
 ('Month', 'string'),
 ('Name', 'string'),
 ('Age', 'string'),
 ('SSN', 'string'),
 ('Occupation', 'string'),
 ('Annual_Income', 'string'),
 ('Monthly_Inhand_Salary', 'double'),
 ('Num_Bank_Accounts', 'int'),
 ('Num_Credit_Card', 'int'),
 ('Interest_Rate', 'int'),
 ('Num_of_Loan', 'string'),
 ('Type_of_Loan', 'string'),
 ('Delay_from_due_date', 'int'),
 ('Num_of_Delayed_Payment', 'string'),
 ('Changed_Credit_Limit', 'string'),
 ('Num_Credit_Inquiries', 'double'),
 ('Credit_Mix', 'string'),
 ('Outstanding_Debt', 'string'),
 ('Credit_Utilization_Ratio', 'double'),
 ('Credit_History_Age', 'string'),
 ('Payment_of_Min_Amount', 'string'),
 ('Total_EMI_per_month', 'double'),
 ('Amount_invested_monthly', 'string'),
 ('Payment_Behaviour', 'string'),
 ('Monthly_Balance', 'string'),
 ('Credit_Score', 'string')]

In [111]:
df=df.withColumn("Age",df.Age.cast('int'))
df.Annual_Income = df.withColumn("Annual_Income",df.Annual_Income.cast('float'))
df.Num_of_Loan = df.withColumn("Num_of_Loan",df.Num_of_Loan.cast('int'))
df.Num_of_Delayed_Payment = df.withColumn("Num_of_Delayed_Payment",df.Num_of_Delayed_Payment.cast('float'))
df.Changed_Credit_Limit = df.withColumn("Changed_Credit_Limit",df.Changed_Credit_Limit.cast('float'))
df.Outstanding_Debt = df.withColumn("Outstanding_Debt",df.Outstanding_Debt.cast('float'))
df.Amount_invested_monthly = df.withColumn("Amount_invested_monthly",df.Amount_invested_monthly.cast('float'))
df.Monthly_Balance = df.withColumn("Monthly_Balance",df.Monthly_Balance.cast('float'))

In [112]:
df.dtypes

[('ID', 'string'),
 ('Customer_ID', 'string'),
 ('Month', 'string'),
 ('Name', 'string'),
 ('Age', 'int'),
 ('SSN', 'string'),
 ('Occupation', 'string'),
 ('Annual_Income', 'string'),
 ('Monthly_Inhand_Salary', 'double'),
 ('Num_Bank_Accounts', 'int'),
 ('Num_Credit_Card', 'int'),
 ('Interest_Rate', 'int'),
 ('Num_of_Loan', 'string'),
 ('Type_of_Loan', 'string'),
 ('Delay_from_due_date', 'int'),
 ('Num_of_Delayed_Payment', 'string'),
 ('Changed_Credit_Limit', 'string'),
 ('Num_Credit_Inquiries', 'double'),
 ('Credit_Mix', 'string'),
 ('Outstanding_Debt', 'string'),
 ('Credit_Utilization_Ratio', 'double'),
 ('Credit_History_Age', 'string'),
 ('Payment_of_Min_Amount', 'string'),
 ('Total_EMI_per_month', 'double'),
 ('Amount_invested_monthly', 'string'),
 ('Payment_Behaviour', 'string'),
 ('Monthly_Balance', 'string'),
 ('Credit_Score', 'string')]

###<b>2. Scrubbing the Data</b>
Scrubbing the data typically involves missing value imputation, data type conversion, standardization, and renaming columns.

In [113]:
# removing unwanted characters
temp = df.toPandas().applymap(lambda x: x if x is np.NaN or not isinstance(x, str) else str(x).strip('_ ,"')).replace(['', 'nan', '!@9#%8', '#F%$D@*&8'], np.NaN)
df=spark.createDataFrame(temp) 

####<b>2.1. Imputing Missing Values

##### <u>2.1.1. checking for the count of null values in all columns

In [114]:
for i in df.columns:
  print("missing in",i,df.filter(df[i].isNull()).count())

missing in ID 0
missing in Customer_ID 0
missing in Month 0
missing in Name 9985
missing in Age 4939
missing in SSN 5572
missing in Occupation 7062
missing in Annual_Income 0
missing in Monthly_Inhand_Salary 15002
missing in Num_Bank_Accounts 0
missing in Num_Credit_Card 0
missing in Interest_Rate 0
missing in Num_of_Loan 0
missing in Type_of_Loan 11408
missing in Delay_from_due_date 0
missing in Num_of_Delayed_Payment 7002
missing in Changed_Credit_Limit 2091
missing in Num_Credit_Inquiries 1965
missing in Credit_Mix 20195
missing in Outstanding_Debt 0
missing in Credit_Utilization_Ratio 0
missing in Credit_History_Age 0
missing in Payment_of_Min_Amount 0
missing in Total_EMI_per_month 0
missing in Amount_invested_monthly 4479
missing in Payment_Behaviour 7600
missing in Monthly_Balance 1200
missing in Credit_Score 0


In [115]:
for i in df.columns:
  if df.filter(df[i].isNull()).count()>0:
    print(i)

Name
Age
SSN
Occupation
Monthly_Inhand_Salary
Type_of_Loan
Num_of_Delayed_Payment
Changed_Credit_Limit
Num_Credit_Inquiries
Credit_Mix
Amount_invested_monthly
Payment_Behaviour
Monthly_Balance


##### <u>2.1.2. Checking the rows which contains the null values

In [116]:
# Here we will be checking the null values present in each column
for i in df.columns:
  print("missing in",i,df.filter(df[i].isNull()).show())

+---+-----------+-----+----+---+---+----------+-------------+---------------------+-----------------+---------------+-------------+-----------+------------+-------------------+----------------------+--------------------+--------------------+----------+----------------+------------------------+------------------+---------------------+-------------------+-----------------------+-----------------+---------------+------------+
| ID|Customer_ID|Month|Name|Age|SSN|Occupation|Annual_Income|Monthly_Inhand_Salary|Num_Bank_Accounts|Num_Credit_Card|Interest_Rate|Num_of_Loan|Type_of_Loan|Delay_from_due_date|Num_of_Delayed_Payment|Changed_Credit_Limit|Num_Credit_Inquiries|Credit_Mix|Outstanding_Debt|Credit_Utilization_Ratio|Credit_History_Age|Payment_of_Min_Amount|Total_EMI_per_month|Amount_invested_monthly|Payment_Behaviour|Monthly_Balance|Credit_Score|
+---+-----------+-----+----+---+---+----------+-------------+---------------------+-----------------+---------------+-------------+-----------+---

##### <u>2.1.3. Describe the columns with missing values


Now, it seems that we have missing values for Name
Monthly_Inhand_Salary,
Type_of_Loan,
Num_of_Delayed_Payment,
Num_Credit_Inquiries,
Amount_invested_monthly,
Monthly_Balance.So we will use describe to explore the data in the columns

In [117]:
# inspecting the numerical columns with missing values
df.select(['Monthly_Inhand_Salary','Num_of_Delayed_Payment','Num_Credit_Inquiries','Amount_invested_monthly','Monthly_Balance']).describe().show()

+-------+---------------------+----------------------+--------------------+-----------------------+--------------------+
|summary|Monthly_Inhand_Salary|Num_of_Delayed_Payment|Num_Credit_Inquiries|Amount_invested_monthly|     Monthly_Balance|
+-------+---------------------+----------------------+--------------------+-----------------------+--------------------+
|  count|                84998|                 92998|               98035|                  95521|               98800|
|   mean|    4194.170849600451|    30.923342437471774|   27.75425103279441|      637.4129984078696|-3.03643724696356...|
| stddev|     3183.68616687327|    226.03189164449924|   193.1773389800335|     2043.3193274670227|3.181295008384093E24|
|    min|    303.6454166666666|                    -1|                 0.0|                    0.0|-3333333333333333...|
|    max|   15204.633333333333|                   996|              2597.0|      999.8610676363429|   999.8134716639598|
+-------+---------------------+-

Since name and Type_of_Loan are categorical values, we have only described the numerical columns

Now we will also describe() all our columns

In [118]:
df.describe().show()

+-------+-------+-----------+------+------------------+------------------+-----------+----------+------------------+---------------------+------------------+-----------------+------------------+-----------------+--------------------+-------------------+----------------------+--------------------+--------------------+----------+-----------------+------------------------+--------------------+---------------------+-------------------+-----------------------+--------------------+--------------------+------------+
|summary|     ID|Customer_ID| Month|              Name|               Age|        SSN|Occupation|     Annual_Income|Monthly_Inhand_Salary| Num_Bank_Accounts|  Num_Credit_Card|     Interest_Rate|      Num_of_Loan|        Type_of_Loan|Delay_from_due_date|Num_of_Delayed_Payment|Changed_Credit_Limit|Num_Credit_Inquiries|Credit_Mix| Outstanding_Debt|Credit_Utilization_Ratio|  Credit_History_Age|Payment_of_Min_Amount|Total_EMI_per_month|Amount_invested_monthly|   Payment_Behaviour|     

##### <u>2.1.4. Check the datatypes of missing values

In [119]:
# we will be checking the data types of the missing value cols
df[['Name','Monthly_Inhand_Salary','Num_of_Delayed_Payment','Num_Credit_Inquiries','Amount_invested_monthly','Monthly_Balance']].dtypes

[('Name', 'string'),
 ('Monthly_Inhand_Salary', 'double'),
 ('Num_of_Delayed_Payment', 'string'),
 ('Num_Credit_Inquiries', 'double'),
 ('Amount_invested_monthly', 'string'),
 ('Monthly_Balance', 'string')]

##### <u>2.1.5. Filling missing vallues</u>
Now we need to impute the missing values. We can do that by dropping the rows that have missing values, filling in the missing values with a test
statistic (such as mean, mode, or median), or predicting the missing values using a machine learning algorithm.<br> 

#####<u>2.1.6. Filling the numerical columns</u>


In [76]:
df=df.fillna('ffill', subset=['Monthly_Inhand_Salary','Num_of_Delayed_Payment','Num_Credit_Inquiries','Amount_invested_monthly','Monthly_Balance'])

In [120]:
def dirty_data_finding(df):
    dirty = []
    columns = df.columns
    for col in columns:
        dtype = df[col].dtypes
        nunique = df[col].nunique()
        null = df[col].isnull().sum()
        duplicates = df[col].duplicated().sum()
        dirty.append([col,dtype,nunique,null,duplicates])
    df_dirty_data_finding = pd.DataFrame(dirty)
    df_dirty_data_finding.columns = ['column','dtype','nunique','null','duplicates']
    return df_dirty_data_finding

dirty_data_finding(df.toPandas())

Unnamed: 0,column,dtype,nunique,null,duplicates
0,ID,object,100000,0,0
1,Customer_ID,object,12500,0,87500
2,Month,object,8,0,99992
3,Name,object,10128,9985,89871
4,Age,float64,1661,4939,98338
5,SSN,object,12500,5572,87499
6,Occupation,object,15,7062,99984
7,Annual_Income,object,13487,0,86513
8,Monthly_Inhand_Salary,float64,13235,15002,86764
9,Num_Bank_Accounts,int64,943,0,99057


In [2]:
import re
import urllib.request
import sys
import os
import argparse
import json
import requests

In [5]:
def main():

    url = "https://www.bbc.co.uk/news/business-41779341"
    response = requests.get(url)
    if response.status_code != 200:
        print('Error: Could not get URL')
        sys.exit(1)

    text = response.text
    # regex = re.compile(r'\£\d+\.?\d*')
    regex=re.compile(r'£\d+\.?\d*|\d+\.?\d*bn|\d+\.?\d*bn euros|\d+\.?\d*bn euro|\d+\.?\d*bn pounds|\d+\.?\d*bn pound|\d+\.?\d*bn dollars|\d+\.?\d*bn dollar|\d+\.?\d*bn euros|\d+\.?\d*bn euro|\d+\.?\d*bn pounds|\d+\.?\d*bn pound|\d+\.?\d*bn dollars|\d+\.?\d*bn dollar|\d+\.?\d*bn euros|\d+\.?\d*bn euro|\d+\.?\d*bn pounds|\d+\.?\d*bn pound|\d+\.?\d*bn dollars|\d+\.?\d*bn dollar|\d+\.?\d*bn euros|\d+\.?\d*bn euro|\d+\.?\d*bn pounds|\d+\.?\d*bn pound|\d+\.?\d*bn dollars|\d+\.?\d*bn dollar|\d+\.?\d*bn euros|\d+\.?\d*bn euro|\d+\.?\d*bn pounds|\d+\.?\d*bn pound|\d+\.?\d*bn dollars|\d+\.?\d*bn dollar|\d+\.?\d*bn euros|\d+\.?\d*bn euro|\d+\.?\d*bn pounds|\d+\.?\d*bn pound|\d+\.?\d*bn dollars|\d+\.?\d*bn dollar|\d+\.?\d*bn euros|\d+\.?\d*bn euro|\d+\.?\d*bn pounds|\d+\.?\d*bn pound|\d+\.?\d*bn dollars|\d+\.?\d*bn dollar|\d+\.?\d*bn euros|\d+\.?\d*bn euro|\d+\.?\d*bn pounds|\d+\.?\d*bn pound|\d+\.?\d*bn dollars')
    matches = regex.findall(text)
    for match in matches:
        print(match)

if __name__ == '__main__':
    main()

0bn
1bn
0bn
0bn
0bn
0bn
0bn
0bn
0bn
0bn
0bn
0bn
0bn
0bn
0bn
1bn
1bn
131bn
£100
100bn
17.4bn
0bn
0bn
1bn
1bn
1bn
1bn
1bn
1bn
1bn
1bn
1bn
0bn
1bn
1bn
1bn
0bn
£600
131bn
£100
131bn
£100
100bn
17.4bn
100bn
17.4bn
£600
£600
£600


In [8]:
def main():

    url = "https://www.bbc.co.uk/news/business-41779341"
    response = requests.get(url)
    if response.status_code != 200:
        print('Error: Could not get URL')
        sys.exit(1)

    text = response.text
    # regex = re.compile(r'\£\d+\.?\d*')
    regex=re.compile(r'£\d+\.?\d*[m]?')
    matches = regex.findall(text)
    for match in matches:
        print(match)

if __name__ == '__main__':
    main()

£100
£600
£100
£100
£600
£600
£600


In [9]:
def main():

    url = "https://www.bbc.co.uk/news/business-41779341"
    response = requests.get(url)
    if response.status_code != 200:
        print('Error: Could not get URL')
        sys.exit(1)

    text = response.text
    # regex = re.compile(r'\£\d+\.?\d*')
    regex=re.compile(r'\$\d+\.?\d*[m]?')
    matches = regex.findall(text)
    for match in matches:
        print(match)

if __name__ == '__main__':
    main()

$131
$100
$17.4
$131
$131
$100
$17.4
$100
$17.4


In [10]:
def main():

    url = "https://www.bbc.co.uk/news/business-41779341"
    response = requests.get(url)
    if response.status_code != 200:
        print('Error: Could not get URL')
        sys.exit(1)

    text = response.text
    # regex = re.compile(r'\£\d+\.?\d*')
    regex=re.compile(r'\d+\.?\d*[m]?\s[euro]')
    matches = regex.findall(text)
    for match in matches:
        print(match)

if __name__ == '__main__':
    main()

0 r
2009 r
2009 r
2009 r


In [12]:
def main():

    url = "https://www.bbc.co.uk/news/business-41779341"
    response = requests.get(url)
    if response.status_code != 200:
        print('Error: Could not get URL')
        sys.exit(1)

    text = response.text
    # regex = re.compile(r'\£\d+\.?\d*')
    regex=re.compile(r'\d+\.?\d*[m]?\s[euros]')
    matches = regex.findall(text)
    for match in matches:
        print(match)

if __name__ == '__main__':
    main()

0 r
1 s
61 s
4800 s
2009 r
2009 r
2009 r


In [14]:
def main():

    url = "https://www.bbc.co.uk/news/business-41779341"
    response = requests.get(url)
    if response.status_code != 200:
        print('Error: Could not get URL')
        sys.exit(1)

    text = response.text
    # regex = re.compile(r'\£\d+\.?\d*')
    regex=re.compile(r'\d+\.?\d*[m]?\s[bn]')
    matches = regex.findall(text)
    for match in matches:
        print(match)

if __name__ == '__main__':
    main()

In [15]:
def main():

    url = "https://www.bbc.co.uk/news/business-41779341"
    response = requests.get(url)
    if response.status_code != 200:
        print('Error: Could not get URL')
        sys.exit(1)

    text = response.text
    # regex = re.compile(r'\£\d+\.?\d*')
    regex=re.compile(r'\d+\.?\d*[m]?\s[p]')
    matches = regex.findall(text)
    for match in matches:
        print(match)

if __name__ == '__main__':
    main()

2 p
2 p
2 p
2 p


In [18]:
def main():

    url = "https://www.bbc.co.uk/news/business-41779341"
    response = requests.get(url)
    if response.status_code != 200:
        print('Error: Could not get URL')
        sys.exit(1)

    text = response.text
    # regex = re.compile(r'\£\d+\.?\d*')
    regex=re.compile(r'\d+\.?\d*[m]?\s[euro]|\d+\.?\d*[m]?\s[euros]|\d+\.?\d*[m]?\s[bn]|\d+\.?\d*[m]?\s[p]')
    matches = regex.findall(text)
    for match in matches:
        print(match)

if __name__ == '__main__':
    main()

0 r
1 s
61 s
4800 s
2009 r
2 p
2009 r
2009 r
2 p
2 p
2 p


In [25]:
import re
x = input("Enter the text: ")
# regex for all phonenumbers
y = re.findall(r'\d{3}\.\d{3}\.\d{4}|\+\d{1}\-\(\d{3}\)\-\d{3}\-\d{4}|\d{1}\-\(\d{3}\)\-\d{3}\-\d{4}|\d{1}\-\d{3}\-\d{3}\-\d{4}|\d{3}\-\d{3}\-\d{4}|\d{3}\s\d{3}\s\d{4}|\(\d{3}\)\s\d{3}\s\d{4}|\(\d{3}\)\-\d{3}\-\d{4}|\d{10}|\d{3}\.\d{3}\.\d{4}|\d{3}\.\d{4}|\d{3}\-\d{4}|\d{10}|\d{11}', x)
print("-----"*20)
print(y)

Enter the text: +1-(800)-545-2468 2-(800)-545-2468 3-800-545-2468 555-123-3456 555 222 3342 (234) 234 2442 (243)-234-2342 1234567890 123.456.7890 123.4567 123-4567 1234567900 12345678900
----------------------------------------------------------------------------------------------------
['+1-(800)-545-2468', '2-(800)-545-2468', '3-800-545-2468', '555-123-3456', '555 222 3342', '(234) 234 2442', '(243)-234-2342', '1234567890', '123.456.7890', '123.4567', '123-4567', '1234567900', '1234567890']


In [26]:
# +1-(800)-545-2468
# 2-(800)-545-2468
# 3-800-545-2468
# 555-123-3456
# 555 222 3342
# (234) 234 2442
# (243)-234-2342
# 1234567890
# 123.456.7890
# 123.4567
# 123-4567
# 1234567900
# 12345678900
