# La régression linéaire 

Notre travail consiste à explorer un jeu de données e-commerce décrivant l'utilisation du site Web et de l'application mobile d'une entreprise. Nous cherchons ensuite à construire un modèle de régression linéaire permettant de prédire les dépenses annuelles de chaque client pour les produits de l'entreprise.

Démarrer une session Spark :

In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name 'lr_example'.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [2]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the customer CSV: the first line provides the column names and the
# column types are inferred from the file contents.
data = spark.read.csv(
    "Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Print the column names and inferred types to check the CSV was parsed as expected.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [6]:
# Display the first rows of the DataFrame as a text table.
data.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [7]:
# Fetch the first record as a Row object to see the untruncated field values.
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [8]:
# Print every field of the first row on its own line.
print(*data.head(), sep="\n")

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


## Setting Up DataFrame for Machine Learning 

In [9]:
# As in the previous notebook, the Spark ML API expects its input DataFrame
# to have this form:
# ("label","features")


# NOTE(review): Vectors is not used in any cell visible in this notebook.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [10]:
# List the column names so the feature columns can be picked out below.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [11]:
# Pack the four numeric columns into a single vector column named "features".
# The order of inputCols fixes the order of the entries in that vector.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        'Length of Membership',
    ],
)

In [12]:
# Apply the assembler: the result keeps the original columns and adds "features".
output = assembler.transform(data)

In [13]:
# Preview only the assembled feature vectors.
output.select("features").show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [14]:
# Preview the full transformed DataFrame (original columns plus "features").
output.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37

In [15]:
# Keep only what the regression needs: the feature vector and the label column.
final_data = output.select(
    "features",
    'Yearly Amount Spent',
)

In [16]:
# Split the data 70% / 30% into training and test sets.
# A fixed seed keeps the split (and therefore the summary statistics, fitted
# coefficients and error metrics computed from it) stable across
# "Restart & Run All" executions. Outputs recorded from an earlier unseeded
# run will differ until the notebook is re-executed.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=42)

In [17]:
# Summary statistics (count, mean, stddev, min, max) of the label in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                355|
|   mean| 500.61064740300236|
| stddev|  81.89229980641147|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [18]:
# Same summary for the test split, to compare its distribution with the training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                145|
|   mean| 496.13958138779043|
| stddev|  72.79255976071663|
|    min| 302.18954780965197|
|    max|  712.3963268096637|
+-------+-------------------+



In [19]:
# Create the linear regression estimator. The label is the yearly amount
# spent; the feature column is spelled out explicitly (same as the default).
lr = LinearRegression(
    featuresCol="features",
    labelCol='Yearly Amount Spent',
)

In [20]:

# Fit the estimator on the training split only.
lrModel = lr.fit(train_data)

In [21]:
# Show the fitted coefficients and the intercept.
print(
    "Coefficients: {} Intercept: {}".format(
        lrModel.coefficients,
        lrModel.intercept,
    )
)

Coefficients: [25.976756136682653,39.03144547839342,0.07997171913420675,61.5986867636123] Intercept: -1049.742718585099


In [22]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and the error metrics used in the following cells.
test_results = lrModel.evaluate(test_data)

In [23]:

# Show the residuals computed on the test split.
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 11.144131397921456|
|  6.402398794928104|
| -6.700440344240917|
| -7.858334888044965|
|  9.242961760717208|
| -4.651431143658101|
|  21.29507871300848|
| 2.4859850528113157|
|   -4.5552643330509|
|  2.632710940458253|
|  3.589099816427108|
| -5.199784875234457|
|  6.296694571666819|
| -6.748540527222872|
| -4.745870842882653|
| -4.275192635176381|
|-18.854297385539496|
| -2.780564253221371|
|  7.652085748158868|
|-2.6877385269435763|
+-------------------+
only showing top 20 rows



In [24]:
# Drop the label column to mimic new, unlabeled data.
unlabeled_data = test_data.select('features')

In [25]:
# Score the unlabeled features; the result carries a "prediction" column.
predictions = lrModel.transform(unlabeled_data)

In [26]:
# Preview the feature vectors next to their predicted yearly amount spent.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|  397.496219674706|
|[30.9716756438877...| 488.2362109619646|
|[31.0613251567161...| 494.2558984021425|
|[31.1280900496166...| 565.1110216350996|
|[31.1695067987115...| 418.1135690415756|
|[31.2681042107507...|  428.121964317482|
|[31.2834474760581...|  570.486010712659|
|[31.3091926408918...| 430.2347327871223|
|[31.4252268808548...| 535.3219829878128|
|[31.4459724827577...| 482.2442539946703|
|[31.5316044825729...|432.92650591293545|
|[31.6253601348306...|381.53668563215865|
|[31.6548096756927...| 468.9667291558817|
|[31.7207699002873...| 545.5234740052458|
|[31.7656188210424...| 501.2999524784898|
|[31.8124825597242...| 397.0855376189736|
|[31.8164283341993...| 519.9767888891959|
|[31.8186165667690...|  449.199237623357|
|[31.8209982016720...| 417.0231952650545|
|[31.8530748017465...|461.97286198929555|
+--------------------+------------------+

In [27]:
# Report the test-set error metrics: root mean squared error and mean squared error.
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MSE: {}".format(test_results.meanSquaredError))

RMSE: 10.706499461315444
MSE: 114.6291307151479
