# Big Data Machine Learning Classification with Spark

Proje Amacı: Spark Kullanarak müşterilerin davranışlarını tahmin etme

<img src="https://miro.medium.com/v2/resize:fit:800/1*MwZZjt-IlJU0TFaZxfBz1Q.jpeg" height="200">

<a href="https://drive.google.com/file/d/1Tptkdnvl_ycQUiq3GUCyYBy4FPyKV7oU/view?usp=drive_link">Dataya Buradan Erisebilirsiniz</a>

In [1]:
#pip install pyspark

In [2]:
import pandas as pd

In [3]:
df=pd.read_csv("churn.csv")

### EDA

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
0,0,Cameron Williams,42.0,11066.8,0,7.22,8.0,1
1,1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,1
2,2,Eric Lozano,38.0,12884.75,0,6.67,12.0,1
3,3,Phillip White,42.0,8010.76,0,6.71,10.0,1
4,4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,1


In [5]:
df.columns

Index(['Unnamed: 0', 'Names', 'Age', 'Total_Purchase', 'Account_Manager',
       'Years', 'Num_Sites', 'Churn'],
      dtype='object')

In [6]:
df["Churn"].unique()

array([1, 0])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       900 non-null    int64  
 1   Names            900 non-null    object 
 2   Age              900 non-null    float64
 3   Total_Purchase   900 non-null    float64
 4   Account_Manager  900 non-null    int64  
 5   Years            900 non-null    float64
 6   Num_Sites        900 non-null    float64
 7   Churn            900 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 56.4+ KB


In [8]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
Names,0
Age,0
Total_Purchase,0
Account_Manager,0
Years,0
Num_Sites,0
Churn,0


In [9]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
count,900.0,900.0,900.0,900.0,900.0,900.0,900.0
mean,449.5,41.816667,10062.824033,0.481111,5.273156,8.587778,0.166667
std,259.951919,6.12756,2408.644532,0.499921,1.274449,1.764836,0.372885
min,0.0,22.0,100.0,0.0,1.0,3.0,0.0
25%,224.75,38.0,8497.1225,0.0,4.45,7.0,0.0
50%,449.5,42.0,10045.87,0.0,5.215,8.0,0.0
75%,674.25,46.0,11760.105,1.0,6.11,10.0,0.0
max,899.0,65.0,18026.01,1.0,9.15,14.0,1.0


In [10]:
df.corr(numeric_only=True)

Unnamed: 0.1,Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
Unnamed: 0,1.0,-0.037801,-0.018938,-0.067247,-0.143873,-0.378287,-0.645498
Age,-0.037801,1.0,-0.037208,-0.014749,0.005625,-0.00607,0.085926
Total_Purchase,-0.018938,-0.037208,1.0,0.015856,-0.005623,-0.00339,0.024031
Account_Manager,-0.067247,-0.014749,0.015856,1.0,0.02293,0.033401,0.070611
Years,-0.143873,0.005625,-0.005623,0.02293,1.0,0.051642,0.214329
Num_Sites,-0.378287,-0.00607,-0.00339,0.033401,0.051642,1.0,0.525398
Churn,-0.645498,0.085926,0.024031,0.070611,0.214329,0.525398,1.0


### Feature Engineering

In [11]:
dff=df.drop("Names",axis=1)

In [12]:
dff.to_csv('chr.csv', index=False)

### Modelling

In [13]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName("Sparklamüşterisiniflandirma").getOrCreate()

data = spark.read.csv("chr.csv", inferSchema=True,header=True)
predictors = data.columns[:-1]
assembler = VectorAssembler(inputCols=predictors, outputCol="features")
data = assembler.transform(data).select("features", "Churn")

train_data, test_data = data.randomSplit([0.70, 0.30], seed=42)
GPT = GBTClassifier(labelCol="Churn", featuresCol="features")
model = GPT.fit(train_data)
predictions = model.transform(test_data)
evaluator = BinaryClassificationEvaluator(labelCol="Churn")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

spark.stop()

Accuracy: 0.9973262032085561
