In [46]:
import csv
import os
import sys
# Spark imports
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import desc
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Word2Vec
import numpy 

from csv import reader

In [43]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

<Font size=5 color=red>Investigate the original dataset (obviously, it cannot be used). Take a look at https://stackoverflow.com/questions/13793529/r-error-invalid-type-list-for-variable to see how useless the Body column information could be!

The point here is that the body information consists mostly of codes and some weird patterns that are not useful for our purpose. The most important information here is the connection between the title of the questions and tags. So, I removed the Body column from the dataset.</Font>

In [3]:
try:
    spark = init_spark()

    filename1 = "./Train.csv"
    df2 = spark.read.option("multiLine", 'true').option("escape","\'").csv(filename1, header=True)
    print(df2.count())
    print(df2.show(10))    
except:
    pass

62819203
+--------------------+--------------------+--------------------+--------------------+
|                  Id|               Title|                Body|                Tags|
+--------------------+--------------------+--------------------+--------------------+
|                   1|How to check if a...|<p>I'd like to ch...|php image-process...|
|                   2|How can I prevent...|<p>In my favorite...|             firefox|
|                   3|R Error Invalid t...|"<p>I am import m...|                null|
|      expert_trai...|                null|                null|                null|
|      expert_data...|                null|                null|                null|
|      rf_model = ...| data=expert_data...|     importance=TRUE|      do.trace=100);|
|                   }|                null|                null|                null|
|       </code></pre>|                null|                null|                null|
|<p>Structure of t...|                null|  

<Font size=5 color=red>For removing the Body column, I read all the dataset once using Pandas library. After that, I removed the column and got an export to have a concrete file as our dataset. This part has been ommited from the notebook.</Font>

In [5]:
spark = init_spark()

filename = "./TrainWithoutBody.csv"
df1 = spark.read.option("multiLine", 'true').option("escape","\'").csv(filename, header=True)
df1 = df1.drop("_c0")
df1 = df1.dropna()

rddTags = df1.select("Tags").rdd

df1.count()

22/03/28 17:38:06 WARN Utils: Your hostname, Ashrafs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.239 instead (on interface en0)
22/03/28 17:38:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/28 17:38:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

6017243

In [5]:
df1.show(5)

+---+--------------------+--------------------+
| Id|               Title|                Tags|
+---+--------------------+--------------------+
|  1|How to check if a...|php image-process...|
|  2|How can I prevent...|             firefox|
|  3|R Error Invalid t...|r matlab machine-...|
|  4|How do I replace ...|     c# url encoding|
|  5|How to modify who...|php api file-get-...|
+---+--------------------+--------------------+
only showing top 5 rows



<Font size = 5, color=green>Finding the 100 most used tags (one DT per each most used tag)

In [6]:
splittedTags = rddTags.filter(lambda r: r[0] != None).flatMap(lambda r: r[0].split(" ")).map(lambda r: (r, 1)).reduceByKey(lambda x, y: x + y)

splittedTags = splittedTags.sortBy(lambda r: r[1], False) #Sorted with number of usage (you can collect and see)

splittedTagsSorted = splittedTags.map(lambda r: r[0]) #Delete this line if you want to see number of times they have been used.

# df10 = anSorted.toDF()


mostUsedTags = splittedTagsSorted.collect()[0:100]



                                                                                

In [7]:
mostUsedTags

['c#',
 'java',
 'php',
 'javascript',
 'android',
 'jquery',
 'c++',
 'python',
 'iphone',
 'asp.net',
 'mysql',
 'html',
 '.net',
 'ios',
 'objective-c',
 'sql',
 'css',
 'linux',
 'ruby-on-rails',
 'windows',
 'c',
 'sql-server',
 'ruby',
 'wpf',
 'xml',
 'ajax',
 'database',
 'regex',
 'windows-7',
 'asp.net-mvc',
 'xcode',
 'django',
 'osx',
 'arrays',
 'vb.net',
 'eclipse',
 'json',
 'facebook',
 'ruby-on-rails-3',
 'ubuntu',
 'performance',
 'networking',
 'string',
 'multithreading',
 'winforms',
 'security',
 'asp.net-mvc-3',
 'visual-studio-2010',
 'bash',
 'homework',
 'image',
 'wcf',
 'html5',
 'wordpress',
 'web-services',
 'visual-studio',
 'forms',
 'algorithm',
 'sql-server-2008',
 'linq',
 'oracle',
 'git',
 'query',
 'perl',
 'apache2',
 'flash',
 'actionscript-3',
 'ipad',
 'spring',
 'apache',
 'silverlight',
 'email',
 'r',
 'cocoa-touch',
 'cocoa',
 'swing',
 'hibernate',
 'excel',
 'entity-framework',
 'file',
 'shell',
 'flex',
 'api',
 'list',
 'internet-explo

In [8]:
print(rddTags.count())

[Stage 5:>                                                          (0 + 1) / 1]

6017243


                                                                                

In [9]:
rddTags.take(10)

[Row(Tags='php image-processing file-upload upload mime-types'),
 Row(Tags='firefox'),
 Row(Tags='r matlab machine-learning'),
 Row(Tags='c# url encoding'),
 Row(Tags='php api file-get-contents'),
 Row(Tags='proxy active-directory jmeter'),
 Row(Tags='core-plot'),
 Row(Tags='c# asp.net windows-phone-7'),
 Row(Tags='.net javascript code-generation'),
 Row(Tags='sql variables parameters procedure calls')]

<Font size=5.5, color="purpule">Here, I have cleaned the Tags column to only contain the most used tags. For example, I ommited the "upload" tag from first group of tags for the first question, because it's not a most used tag.

In [9]:
def replaceNoneWithString(x):
    if (x == None): return "None"
    else : return x

In [10]:
rrr = rddTags.map(lambda r: r[0]).map(replaceNoneWithString).map(lambda r: r.split(" ")).map(lambda r: [ped for ped in r if ped in mostUsedTags])
cleanedTags = rrr.take(10)
cleanedTags

[['php'],
 ['firefox'],
 ['r'],
 ['c#'],
 ['php', 'api'],
 [],
 [],
 ['c#', 'asp.net'],
 ['.net', 'javascript'],
 ['sql']]

# Subject titles to TF-IDF

In [11]:
tokenizer = Tokenizer(inputCol="Title", outputCol="transformed_tfidf")
wordsData = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="transformed_tfidf", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

featurizedData.take(1)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

                                                                                

In [12]:
rescaledData.select("id", "tags", "features").show()
rescaledData.take(1)

+---+--------------------+--------------------+
| id|                tags|            features|
+---+--------------------+--------------------+
|  1|php image-process...|(20,[1,3,4,8,9,10...|
|  2|             firefox|(20,[1,2,3,7,12,1...|
|  3|r matlab machine-...|(20,[2,4,5,6,7,17...|
|  4|     c# url encoding|(20,[3,6,7,10,13,...|
|  5|php api file-get-...|(20,[3,5,8,13,15,...|
|  6|proxy active-dire...|(20,[0,3,4,7,13,1...|
|  7|           core-plot|(20,[3,6,7,8,10,1...|
|  8|c# asp.net window...|(20,[0,3,7,8,10,1...|
|  9|.net javascript c...|(20,[0,1,3,4],[1....|
| 10|sql variables par...|(20,[0,3,6,12,16,...|
| 11|.net obfuscation ...|(20,[0,3,6,8,11,1...|
| 12|algorithm languag...|(20,[0,1,5,19],[1...|
| 13|postfix migration...|(20,[1,3,8,18,19]...|
| 14|documentation lat...|(20,[10,13,16,17,...|
| 15|           windows-7|(20,[1,2,12,13,15...|
| 16|php url-routing c...|(20,[2,4,10,18],[...|
| 17|   r temporary-files|(20,[3,5,8,13,15,...|
| 18|         wpf binding|(20,[0,8,12,13

[Row(Id='1', Title='How to check if an uploaded file is an image without mime type?', Tags='php image-processing file-upload upload mime-types', transformed_tfidf=['how', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'without', 'mime', 'type?'], rawFeatures=SparseVector(20, {1: 1.0, 3: 3.0, 4: 1.0, 8: 2.0, 9: 1.0, 10: 2.0, 12: 2.0, 16: 1.0}), features=SparseVector(20, {1: 1.1357, 3: 1.8792, 4: 1.1753, 8: 1.4617, 9: 1.1974, 10: 2.0765, 12: 2.0979, 16: 0.9553}))]

# Subject titles to Word2Vec

In [13]:
tokenizer = Tokenizer(inputCol="Title", outputCol="tokenized_text")
tokenized_df = tokenizer.transform(df1)

word2Vec = Word2Vec(inputCol="tokenized_text", outputCol="features", vectorSize=100)
w2v_model = word2Vec.fit(tokenized_df)
w2v_data = w2v_model.transform(tokenized_df)


22/03/28 17:42:43 WARN MemoryStore: Not enough space to cache rdd_60_0 in memory! (computed 248.3 MiB so far)
22/03/28 17:42:43 WARN BlockManager: Block rdd_60_0 could not be removed as it was not found on disk or in memory
22/03/28 17:42:43 WARN BlockManager: Putting block rdd_60_0 failed
22/03/28 17:42:44 WARN MemoryStore: Not enough space to cache broadcast_25 in memory! (computed 60.2 MiB so far)
22/03/28 17:42:45 WARN MemoryStore: Not enough space to cache broadcast_22 in memory! (computed 68.8 MiB so far)
22/03/28 17:42:45 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/03/28 17:42:45 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

# Sample the Dataset

In [18]:
sampled_df = df1.sample(False, 0.0083, seed=42)

# Sampled Word2Vec

In [35]:
tokenizer_sampled = Tokenizer(inputCol="Title", outputCol="tokenized_text")
tokenized_df_sampled = tokenizer.transform(sampled_df)

word2Vec_sampled = Word2Vec(inputCol="tokenized_text", outputCol="features", vectorSize=100)
w2v_model_sampled = word2Vec_sampled.fit(tokenized_df_sampled)
w2v_data_sampled = w2v_model_sampled.transform(tokenized_df_sampled)

                                                                                

In [36]:
w2v_data_sampled.take(1)

[Row(Id='256', Title='How to figure out all colors in a gradient?', Tags='javascript html html5 colors mobile-safari', tokenized_text=['how', 'to', 'figure', 'out', 'all', 'colors', 'in', 'a', 'gradient?'], features=DenseVector([0.0453, 0.011, 0.0179, 0.0552, 0.0396, -0.072, -0.0676, 0.0642, -0.0216, 0.0092, 0.003, -0.0752, -0.0464, -0.0993, 0.0792, -0.0697, 0.0541, 0.0066, 0.0379, -0.0012, -0.1039, 0.0168, 0.0468, 0.0678, -0.0138, 0.0469, -0.043, 0.0017, -0.0636, -0.0329, -0.0881, -0.0813, 0.0638, -0.0253, 0.019, -0.0105, -0.0076, -0.1132, -0.0675, 0.0242, 0.0062, -0.0018, 0.0677, 0.1666, -0.0445, 0.0517, -0.0105, 0.0297, -0.0228, 0.0368, 0.0895, -0.0697, -0.0205, -0.0536, -0.0087, 0.0111, -0.025, 0.0166, 0.0306, -0.0238, -0.0804, -0.0428, -0.0046, 0.0559, 0.0364, 0.0476, -0.007, -0.0841, 0.0906, 0.0261, 0.0482, 0.0353, 0.0064, -0.1306, -0.0393, -0.0606, 0.0143, 0.0826, -0.017, 0.0578, -0.1151, 0.0087, 0.1127, -0.0206, -0.0386, -0.0009, -0.0453, 0.0112, -0.0217, 0.0935, -0.0431, 0.041

# Logistic Regression 

In [44]:
# targets = [lambda r: r.split(' '), w2v_data_sampled.select("Tags").collect()]
target_tags = w2v_data_sampled.select(split(col("Tags")," ").alias("Tag_Array"))
target_tags.take(1)

[Row(Tag_Array=['javascript', 'html', 'html5', 'colors', 'mobile-safari'])]

In [45]:

def my_function(r):
    r = r.Tag_Array
    target = []
    for t in mostUsedTags:
        if t in r:
            target.append(1)
        else:
            target.append(0)
    return (r.id, r.Title, r.features, target)

w2v_with_targets = w2v_data_sampled.rdd.map(lambda r: my_function(r))
w2v_with_targets.take(1)

22/03/28 19:05:47 ERROR Executor: Exception in task 0.0 in stage 43.0 (TID 43)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/rdd.py", line 1560, in takeUpToNumLeft
    yield next(iterator)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 138, in load_stream
    yield self._read_with_length(stream)
  File "/op

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 43.0 failed 1 times, most recent failure: Lost task 0.0 in stage 43.0 (TID 43) (10.0.0.239 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/rdd.py", line 1560, in takeUpToNumLeft
    yield next(iterator)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 138, in load_stream
    yield self._read_with_length(stream)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 893, in _parse_datatype_json_string
    return _parse_datatype_json_value(json.loads(json_string))
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 910, in _parse_datatype_json_value
    return _all_complex_types[tpe].fromJson(json_value)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 596, in fromJson
    return StructType([StructField.fromJson(f) for f in json["fields"]])
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 596, in <listcomp>
    return StructType([StructField.fromJson(f) for f in json["fields"]])
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 441, in fromJson
    _parse_datatype_json_value(json["type"]),
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 912, in _parse_datatype_json_value
    return UserDefinedType.fromJson(json_value)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 755, in fromJson
    m = __import__(pyModule, globals(), locals(), [pyClass])
  File "<frozen zipimport>", line 259, in load_module
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/ml/__init__.py", line 22, in <module>
    from pyspark.ml.base import Estimator, Model, Predictor, PredictionModel, \
  File "<frozen zipimport>", line 259, in load_module
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/ml/base.py", line 25, in <module>
    from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasLabelCol, HasFeaturesCol, \
  File "<frozen zipimport>", line 259, in load_module
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/ml/param/__init__.py", line 21, in <module>
    import numpy as np
ModuleNotFoundError: No module named 'numpy'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2236)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/rdd.py", line 1560, in takeUpToNumLeft
    yield next(iterator)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 138, in load_stream
    yield self._read_with_length(stream)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 893, in _parse_datatype_json_string
    return _parse_datatype_json_value(json.loads(json_string))
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 910, in _parse_datatype_json_value
    return _all_complex_types[tpe].fromJson(json_value)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 596, in fromJson
    return StructType([StructField.fromJson(f) for f in json["fields"]])
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 596, in <listcomp>
    return StructType([StructField.fromJson(f) for f in json["fields"]])
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 441, in fromJson
    _parse_datatype_json_value(json["type"]),
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 912, in _parse_datatype_json_value
    return UserDefinedType.fromJson(json_value)
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 755, in fromJson
    m = __import__(pyModule, globals(), locals(), [pyClass])
  File "<frozen zipimport>", line 259, in load_module
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/ml/__init__.py", line 22, in <module>
    from pyspark.ml.base import Estimator, Model, Predictor, PredictionModel, \
  File "<frozen zipimport>", line 259, in load_module
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/ml/base.py", line 25, in <module>
    from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasLabelCol, HasFeaturesCol, \
  File "<frozen zipimport>", line 259, in load_module
  File "/opt/anaconda3/envs/lab1/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/ml/param/__init__.py", line 21, in <module>
    import numpy as np
ModuleNotFoundError: No module named 'numpy'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2236)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
