# *Task 1 (CV)*

In [1]:
# ### For Colab Only ###
# !apt-get install openjdk-8-jdk-headless -qq > /dev/null
# !wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
# !tar xf spark-2.4.5-bin-hadoop2.7.tgz
# !pip install -q findspark

# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

In [2]:
# ### For Colab Only ###
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

In [3]:
# ### For Colab Only ###
# trainFile = drive.CreateFile({'id':"1axe3asDoDNJGQlv-iA9wp9laPtybEGzX"})
# trainFile.GetContentFile('train.csv')
# testFile = drive.CreateFile({'id':"1521WHxxG3SJHiPNknvuMcjepyoVY4X39"})
# testFile.GetContentFile('test.csv')

In [4]:
import findspark
# findspark.init()
findspark.init('/home/cse587/spark-2.4.0-bin-hadoop2.7') ## SPARK Initialization IN VM
import pyspark
from pyspark import SparkContext
from pyspark.sql import *

# Build the session FIRST. The driver JVM is launched by whichever call creates
# the SparkContext, and spark.driver.memory can only take effect if it is part
# of the configuration at that moment. Creating a bare SparkContext() beforehand
# starts the JVM with default memory and the 15g request arrives too late.
spark = (
    SparkSession.builder
    .appName("Genre Prediction")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

# Reuse the session's own context instead of constructing a second one; this
# also makes the cell safe to re-run (SparkContext() raises if one is active).
sc = spark.sparkContext

In [5]:
import pandas as pd  
# Read the training and test CSV files from the working directory into pandas frames.
data, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# data.head()
# test.head()

In [6]:
# SQLContext handle on the running SparkContext.
# NOTE(review): `sqlCtx` is not referenced again in the visible cells - confirm before removing.
sqlCtx = SQLContext(sc)

# Convert both pandas frames into Spark DataFrames (train first, then test).
df, test_df = (spark.createDataFrame(pandas_frame) for pandas_frame in (data, test))

In [7]:
# Inspect the training frame: print its column schema, then its first row
# (long string values are shown truncated, as in the output below).
df.printSchema()
df.show(1)
# df.show(1, truncate = False)

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)

+--------+----------+--------------------+--------------------+
|movie_id|movie_name|                plot|               genre|
+--------+----------+--------------------+--------------------+
|23890098|Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|
+--------+----------+--------------------+--------------------+
only showing top 1 row



In [8]:
from pyspark.sql import SQLContext
# Second SQLContext wrapper around the same SparkContext.
# NOTE(review): `sqlContext` is not referenced in the visible cells - confirm before removing.
sqlContext = SQLContext(sc)

from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
from ast import literal_eval
import json

# UDF helper: parse a list that was stored as its Python-literal string form
def parse_array_from_string(x):
    """Turn a stringified list such as ``"['World cinema', 'Drama']"`` into a list of str.

    The text is parsed with ``ast.literal_eval`` so that quoting is honoured:
    entries containing an apostrophe (rendered with double quotes) or a comma
    stay intact, and ``"[]"`` yields an empty list instead of ``['']``.

    Args:
        x: The string to parse, or ``None`` for a missing value.

    Returns:
        A list of whitespace-stripped strings. ``None`` input gives ``[]``.
        Text that is not a valid list/tuple literal falls back to the legacy
        behaviour: drop ``[``, ``]`` and ``'`` and split on commas.
    """
    if x is None:
        return []

    try:
        parsed = literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # Not a well-formed Python literal; handled by the fallback below.
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item).strip() for item in parsed]

    # Legacy fallback for free-form text such as "Drama, Comedy".
    stripped = x.replace('[', '').replace(']', '').replace("'", '')
    return [word.strip() for word in stripped.split(',')]

# Spark UDF wrapper around parse_array_from_string; declared to return array<string>.
retrieve_array = udf(parse_array_from_string, ArrayType(StringType()))

def lower_case(x):
    """Return a new list holding the lower-cased form of every string in ``x``."""
    return [entry.lower() for entry in x]

# Spark UDF wrapper around lower_case; declared to return array<string>.
convert_to_lower = udf(lower_case, ArrayType(StringType()))

# "label" = the genre string parsed into a list, then lower-cased element by element.
parsed_genres = retrieve_array(col("genre"))
df = df.withColumn("label", convert_to_lower(parsed_genres))

In [9]:
# Compare the two schemas (the test frame has no genre/label columns, see output)
# and preview the first training row with the new "label" array column.
test_df.printSchema()
df.printSchema()
df.show(1)
# df.show(1, truncate= False)

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- label: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------+----------+--------------------+--------------------+--------------------+
|movie_id|movie_name|                plot|               genre|               label|
+--------+----------+--------------------+--------------------+--------------------+
|23890098|Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|[world cinema, dr...|
+--------+----------+--------------------+--------------------+--------------------+
only showing top 1 row



In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
#df1 = df
#word1 = 'work'
from pyspark.sql.functions import array_contains, col

# Genres to predict. Each one becomes its own 0/1 indicator column on `df`,
# named exactly like the genre string.
genres = ['drama','comedy','romance film','thriller','action','world cinema','crime fiction','horror','black-and-white','indie','action/adventure',
'adventure','family film','short film','romantic drama','animation','musical','science fiction','mystery','romantic comedy']

# Native array_contains replaces the former Python UDF, which read the genre to
# match from a mutable module-level variable and therefore only worked because
# of when Spark happened to serialize the function. The boolean result is cast
# to int so the column holds 1 when the genre is in "label" and 0 otherwise
# (a null "label" array yields null).
for word in genres:
    df = df.withColumn(word, array_contains(col("label"), word).cast("int"))

In [11]:
# Preview the first row, now carrying one indicator column per genre.
df.show(1);

+--------+----------+--------------------+--------------------+--------------------+-----+------+------------+--------+------+------------+-------------+------+---------------+-----+----------------+---------+-----------+----------+--------------+---------+-------+---------------+-------+---------------+
|movie_id|movie_name|                plot|               genre|               label|drama|comedy|romance film|thriller|action|world cinema|crime fiction|horror|black-and-white|indie|action/adventure|adventure|family film|short film|romantic drama|animation|musical|science fiction|mystery|romantic comedy|
+--------+----------+--------------------+--------------------+--------------------+-----+------+------------+--------+------+------------+-------------+------+---------------+-----+----------------+---------+-----------+----------+--------------+---------+-------+---------------+-------+---------------+
|23890098|Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|[world cinema, dr..

In [12]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import lower, col

# Stage 1: split the raw "plot" text into a "words_token" array.
tokenizer = Tokenizer(inputCol='plot', outputCol='words_token')
# Stage 2: filter stop words out of "words_token" into "words_clean".
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')

# Run both stages, in order, over the training frame and the test frame.
df = remover.transform(tokenizer.transform(df))
test_df = remover.transform(tokenizer.transform(test_df))

In [13]:
import pathlib
model_path1 = "models/count-vectorizer-model"
file1 = pathlib.Path(model_path1)

# Vocabulary / term-count model over the cleaned tokens
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel

# Fit and persist the vectorizer on the first run; afterwards reload the saved copy.
if not file1.exists():
    print("count-vectorizer-model does not exist... Training Model")
    cv = CountVectorizer(inputCol="words_clean", outputCol="features")
    model = cv.fit(df)
    print("Saving Trained Model")
    model.save(model_path1)
else:
    print("count-vectorizer-model exists")
    model = CountVectorizerModel.load(model_path1)

# Add the "features" vector column to the training frame and to the test frame.
print("Transforming data")
rescaledData, test_rescaledData = (model.transform(frame) for frame in (df, test_df))
print("Transforming data Done")


# featurizedData.show(2)

count-vectorizer-model exists
Transforming data
Transforming data Done


In [14]:
# One training frame per genre: that genre's indicator column exposed as "label",
# next to the shared "features" vector. The order fixes the numbering used by the
# per-genre models and predictions further down.
(
    trainDF1, trainDF2, trainDF3, trainDF4, trainDF5,
    trainDF6, trainDF7, trainDF8, trainDF9, trainDF10,
    trainDF11, trainDF12, trainDF13, trainDF14, trainDF15,
    trainDF16, trainDF17, trainDF18, trainDF19, trainDF20,
) = (
    rescaledData.select(col(genre_column).alias("label"), "features")
    for genre_column in (
        "drama", "comedy", "romance film", "thriller", "action",
        "world cinema", "crime fiction", "horror", "black-and-white", "indie",
        "action/adventure", "adventure", "family film", "short film", "romantic drama",
        "animation", "musical", "science fiction", "mystery", "romantic comedy",
    )
)

In [15]:
# Model input for the test set: only the "features" vector column is kept
# (movie_id is re-attached later by joining back on "features").
testDF = test_rescaledData.select("features")

In [16]:
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Unfitted logistic-regression estimator with default parameters; the cells
# below call lrm.fit(...) once per genre-specific training frame.
lrm = LogisticRegression()

In [17]:
# Classifier 1 ("drama"): train and persist on the first run, otherwise reload from disk.
model_path3 = "models/lr-model1"
file3 = pathlib.Path(model_path3)
if not file3.exists():
    print("LR-model Doesnot Exist...Training Model")
    model1 = lrm.fit(trainDF1)
    print("Saving Trained Model")
    model1.save(model_path3)
else:
    print("LR-model Exist")
    model1 = LogisticRegressionModel.load(model_path3)

# Score the test features with this genre's classifier.
result1 = model1.transform(testDF)

LR-model Exist


In [18]:
# Classifier 2 ("comedy"): train and persist on the first run, otherwise reload from disk.
model_path4 = "models/lr-model2"
file4 = pathlib.Path(model_path4)
if not file4.exists():
    print("LR-model Doesnot Exist...Training Model")
    model2 = lrm.fit(trainDF2)
    print("Saving Trained Model")
    model2.save(model_path4)
else:
    print("LR-model Exist")
    model2 = LogisticRegressionModel.load(model_path4)

# Score the test features with this genre's classifier.
result2 = model2.transform(testDF)

LR-model Exist


In [19]:
# Classifier 3 ("romance film"): train and persist on the first run, otherwise reload from disk.
model_path5 = "models/lr-model3"
file5 = pathlib.Path(model_path5)
if not file5.exists():
    print("LR-model Doesnot Exist...Training Model")
    model3 = lrm.fit(trainDF3)
    print("Saving Trained Model")
    model3.save(model_path5)
else:
    print("LR-model Exist")
    model3 = LogisticRegressionModel.load(model_path5)

# Score the test features with this genre's classifier.
result3 = model3.transform(testDF)

LR-model Exist


In [20]:
# Classifier 4 ("thriller"): train and persist on the first run, otherwise reload from disk.
model_path6 = "models/lr-model4"
file6 = pathlib.Path(model_path6)
if not file6.exists():
    print("LR-model Doesnot Exist...Training Model")
    model4 = lrm.fit(trainDF4)
    print("Saving Trained Model")
    model4.save(model_path6)
else:
    print("LR-model Exist")
    model4 = LogisticRegressionModel.load(model_path6)

# Score the test features with this genre's classifier.
result4 = model4.transform(testDF)

LR-model Exist


In [21]:
# Classifier 5 ("action"): train and persist on the first run, otherwise reload from disk.
model_path7 = "models/lr-model5"
file7 = pathlib.Path(model_path7)
if not file7.exists():
    print("LR-model Doesnot Exist...Training Model")
    model5 = lrm.fit(trainDF5)
    print("Saving Trained Model")
    model5.save(model_path7)
else:
    print("LR-model Exist")
    model5 = LogisticRegressionModel.load(model_path7)

# Score the test features with this genre's classifier.
result5 = model5.transform(testDF)

LR-model Exist


In [22]:
# Classifier 6 ("world cinema"): train and persist on the first run, otherwise reload from disk.
model_path8 = "models/lr-model6"
file8 = pathlib.Path(model_path8)
if not file8.exists():
    print("LR-model Doesnot Exist...Training Model")
    model6 = lrm.fit(trainDF6)
    print("Saving Trained Model")
    model6.save(model_path8)
else:
    print("LR-model Exist")
    model6 = LogisticRegressionModel.load(model_path8)

# Score the test features with this genre's classifier.
result6 = model6.transform(testDF)

LR-model Exist


In [23]:
# Classifier 7 ("crime fiction"): train and persist on the first run, otherwise reload from disk.
model_path9 = "models/lr-model7"
file9 = pathlib.Path(model_path9)
if not file9.exists():
    print("LR-model Doesnot Exist...Training Model")
    model7 = lrm.fit(trainDF7)
    print("Saving Trained Model")
    model7.save(model_path9)
else:
    print("LR-model Exist")
    model7 = LogisticRegressionModel.load(model_path9)

# Score the test features with this genre's classifier.
result7 = model7.transform(testDF)

LR-model Exist


In [24]:
# Classifier 8 ("horror"): train and persist on the first run, otherwise reload from disk.
model_path10 = "models/lr-model8"
file10 = pathlib.Path(model_path10)
if not file10.exists():
    print("LR-model Doesnot Exist...Training Model")
    model8 = lrm.fit(trainDF8)
    print("Saving Trained Model")
    model8.save(model_path10)
else:
    print("LR-model Exist")
    model8 = LogisticRegressionModel.load(model_path10)

# Score the test features with this genre's classifier.
result8 = model8.transform(testDF)

LR-model Exist


In [25]:
# Classifier 9 ("black-and-white"): train and persist on the first run, otherwise reload from disk.
model_path11 = "models/lr-model9"
file11 = pathlib.Path(model_path11)
if not file11.exists():
    print("LR-model Doesnot Exist...Training Model")
    model9 = lrm.fit(trainDF9)
    print("Saving Trained Model")
    model9.save(model_path11)
else:
    print("LR-model Exist")
    model9 = LogisticRegressionModel.load(model_path11)

# Score the test features with this genre's classifier.
result9 = model9.transform(testDF)

LR-model Exist


In [26]:
# Classifier 10 ("indie"): train and persist on the first run, otherwise reload from disk.
model_path12 = "models/lr-model10"
file12 = pathlib.Path(model_path12)
if not file12.exists():
    print("LR-model Doesnot Exist...Training Model")
    model10 = lrm.fit(trainDF10)
    print("Saving Trained Model")
    model10.save(model_path12)
else:
    print("LR-model Exist")
    model10 = LogisticRegressionModel.load(model_path12)

# Score the test features with this genre's classifier.
result10 = model10.transform(testDF)

LR-model Exist


In [27]:
# Classifier 11 ("action/adventure"): train and persist on the first run, otherwise reload from disk.
model_path13 = "models/lr-model11"
file13 = pathlib.Path(model_path13)
if not file13.exists():
    print("LR-model Doesnot Exist...Training Model")
    model11 = lrm.fit(trainDF11)
    print("Saving Trained Model")
    model11.save(model_path13)
else:
    print("LR-model Exist")
    model11 = LogisticRegressionModel.load(model_path13)

# Score the test features with this genre's classifier.
result11 = model11.transform(testDF)

LR-model Exist


In [28]:
# Classifier 12 ("adventure"): train and persist on the first run, otherwise reload from disk.
model_path14 = "models/lr-model12"
file14 = pathlib.Path(model_path14)
if not file14.exists():
    print("LR-model Doesnot Exist...Training Model")
    model12 = lrm.fit(trainDF12)
    print("Saving Trained Model")
    model12.save(model_path14)
else:
    print("LR-model Exist")
    model12 = LogisticRegressionModel.load(model_path14)

# Score the test features with this genre's classifier.
result12 = model12.transform(testDF)

LR-model Exist


In [29]:
# Classifier 13 ("family film"): train and persist on the first run, otherwise reload from disk.
model_path15 = "models/lr-model13"
file15 = pathlib.Path(model_path15)
if not file15.exists():
    print("LR-model Doesnot Exist...Training Model")
    model13 = lrm.fit(trainDF13)
    print("Saving Trained Model")
    model13.save(model_path15)
else:
    print("LR-model Exist")
    model13 = LogisticRegressionModel.load(model_path15)

# Score the test features with this genre's classifier.
result13 = model13.transform(testDF)

LR-model Exist


In [30]:
# Classifier 14 ("short film"): train and persist on the first run, otherwise reload from disk.
model_path16 = "models/lr-model14"
file16 = pathlib.Path(model_path16)
if not file16.exists():
    print("LR-model Doesnot Exist...Training Model")
    model14 = lrm.fit(trainDF14)
    print("Saving Trained Model")
    model14.save(model_path16)
else:
    print("LR-model Exist")
    model14 = LogisticRegressionModel.load(model_path16)

# Score the test features with this genre's classifier.
result14 = model14.transform(testDF)

LR-model Exist


In [31]:
# Classifier 15 ("romantic drama"): train and persist on the first run, otherwise reload from disk.
model_path17 = "models/lr-model15"
file17 = pathlib.Path(model_path17)
if not file17.exists():
    print("LR-model Doesnot Exist...Training Model")
    model15 = lrm.fit(trainDF15)
    print("Saving Trained Model")
    model15.save(model_path17)
else:
    print("LR-model Exist")
    model15 = LogisticRegressionModel.load(model_path17)

# Score the test features with this genre's classifier.
result15 = model15.transform(testDF)

LR-model Exist


In [32]:
# Classifier 16 ("animation"): train and persist on the first run, otherwise reload from disk.
model_path18 = "models/lr-model16"
file18 = pathlib.Path(model_path18)
if not file18.exists():
    print("LR-model Doesnot Exist...Training Model")
    model16 = lrm.fit(trainDF16)
    print("Saving Trained Model")
    model16.save(model_path18)
else:
    print("LR-model Exist")
    model16 = LogisticRegressionModel.load(model_path18)

# Score the test features with this genre's classifier.
result16 = model16.transform(testDF)

LR-model Exist


In [33]:
# Classifier 17 ("musical"): train and persist on the first run, otherwise reload from disk.
model_path19 = "models/lr-model17"
file19 = pathlib.Path(model_path19)
if not file19.exists():
    print("LR-model Doesnot Exist...Training Model")
    model17 = lrm.fit(trainDF17)
    print("Saving Trained Model")
    model17.save(model_path19)
else:
    print("LR-model Exist")
    model17 = LogisticRegressionModel.load(model_path19)

# Score the test features with this genre's classifier.
result17 = model17.transform(testDF)

LR-model Exist


In [34]:
# Classifier 18 ("science fiction"): train and persist on the first run, otherwise reload from disk.
model_path20 = "models/lr-model18"
file20 = pathlib.Path(model_path20)
if not file20.exists():
    print("LR-model Doesnot Exist...Training Model")
    model18 = lrm.fit(trainDF18)
    print("Saving Trained Model")
    model18.save(model_path20)
else:
    print("LR-model Exist")
    model18 = LogisticRegressionModel.load(model_path20)

# Score the test features with this genre's classifier.
result18 = model18.transform(testDF)

LR-model Exist


In [35]:
# Classifier 19 ("mystery"): train and persist on the first run, otherwise reload from disk.
model_path21 = "models/lr-model19"
file21 = pathlib.Path(model_path21)
if not file21.exists():
    print("LR-model Doesnot Exist...Training Model")
    model19 = lrm.fit(trainDF19)
    print("Saving Trained Model")
    model19.save(model_path21)
else:
    print("LR-model Exist")
    model19 = LogisticRegressionModel.load(model_path21)

# Score the test features with this genre's classifier.
result19 = model19.transform(testDF)

LR-model Exist


In [36]:
# Classifier 20 ("romantic comedy"): train and persist on the first run, otherwise reload from disk.
model_path22 = "models/lr-model20"
file22 = pathlib.Path(model_path22)
if not file22.exists():
    print("LR-model Doesnot Exist...Training Model")
    model20 = lrm.fit(trainDF20)
    print("Saving Trained Model")
    model20.save(model_path22)
else:
    print("LR-model Exist")
    model20 = LogisticRegressionModel.load(model_path22)

# Score the test features with this genre's classifier.
result20 = model20.transform(testDF)

LR-model Exist


In [37]:
# Rename each result frame's "prediction" column to "prediction<k>", where k is
# the frame's number, so the twenty outputs can sit side by side without clashing.
(
    result1, result2, result3, result4, result5,
    result6, result7, result8, result9, result10,
    result11, result12, result13, result14, result15,
    result16, result17, result18, result19, result20,
) = (
    scored.withColumnRenamed("prediction", "prediction%d" % position)
    for position, scored in enumerate(
        (
            result1, result2, result3, result4, result5,
            result6, result7, result8, result9, result10,
            result11, result12, result13, result14, result15,
            result16, result17, result18, result19, result20,
        ),
        start=1,
    )
)

# Show the renamed schema for the first result frame.
result1.printSchema()

root
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction1: double (nullable = false)



In [38]:
# Collect all twenty per-genre predictions into one frame `t` with the columns
# features, prediction1, ..., prediction20 and one row per distinct feature vector.
#
# The models are applied one after another to the same frame. This replaces a
# chain of nineteen inner joins keyed on the sparse "features" vector, each
# followed by a de-duplication; rows sharing a feature vector multiplied in
# every join before being collapsed again. Predictions depend only on
# "features", so a single pass plus one final de-duplication gives the same rows.
genre_models = [
    model1, model2, model3, model4, model5,
    model6, model7, model8, model9, model10,
    model11, model12, model13, model14, model15,
    model16, model17, model18, model19, model20,
]

t = testDF
for position, genre_model in enumerate(genre_models, start=1):
    t = (
        genre_model.transform(t)
        .withColumnRenamed("prediction", "prediction%d" % position)
        # Each model writes rawPrediction/probability/prediction; clear the
        # first two so the next model can add its own output columns.
        .drop("rawPrediction", "probability")
    )

# Keep a single row per feature vector, as the join-based version did.
t = t.dropDuplicates(['features'])


In [39]:
test_rescaledData.printSchema()

# Re-attach movie_id to the predictions by matching on the feature vector,
# then keep one row per movie.
test_rescaledData_final = test_rescaledData.select("movie_id", "features")
t = (
    t.join(test_rescaledData_final, on=['features'], how='inner')
    .dropDuplicates(['movie_id'])
)
#t.show(5)

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- words_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- words_clean: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)



In [40]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
import math

def makingLabelsForLabelx(val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15,val16,val17,val18,val19,val20):
    """Collapse twenty numeric predictions into one space-separated string.

    Every argument is floored with ``math.floor`` and rendered with ``str``;
    the pieces are joined by single spaces in argument order (val1 first,
    val20 last).
    """
    ordered_predictions = (
        val1, val2, val3, val4, val5, val6, val7, val8, val9, val10,
        val11, val12, val13, val14, val15, val16, val17, val18, val19, val20,
    )
    return " ".join(str(math.floor(prediction)) for prediction in ordered_predictions)

# String-returning UDF over the twenty prediction columns.
labeling = udf(makingLabelsForLabelx, StringType())

# prediction1 ... prediction20, in numeric order, feed the UDF positionally.
prediction_columns = ["prediction%d" % position for position in range(1, 21)]
df_pred = t.withColumn("predictions", labeling(*prediction_columns))

In [41]:
# Final output frame: the movie id plus its combined prediction string.
df_final = df_pred.select("movie_id","predictions")

In [42]:
# Preview the final predictions (default row limit; long strings are truncated in the output).
df_final.show()

+--------+--------------------+
|movie_id|         predictions|
+--------+--------------------+
|23863620|1 0 0 1 0 0 0 0 0...|
|   62693|1 0 0 0 0 0 0 0 0...|
|24817139|1 0 0 0 0 0 0 0 0...|
|13333148|0 1 1 0 0 0 0 0 1...|
| 8269287|1 0 0 0 0 0 0 0 0...|
| 2268290|1 0 0 1 0 0 0 1 0...|
| 4950989|0 0 0 0 0 0 0 0 0...|
|33280938|1 0 1 0 0 0 0 0 0...|
| 1582173|1 0 0 0 0 0 0 1 0...|
| 1595142|0 0 0 0 0 0 0 0 0...|
|27996231|0 0 0 0 0 0 0 0 0...|
|11272068|1 0 0 0 0 0 0 0 0...|
|22822894|0 1 0 0 0 0 0 0 0...|
|28679286|0 0 0 0 0 0 0 1 0...|
|24003057|1 0 0 0 0 0 0 0 0...|
| 1356971|0 1 1 0 0 0 0 0 0...|
|36344884|1 0 0 0 0 0 0 0 0...|
|23658166|0 0 0 0 0 0 0 0 0...|
| 1600825|0 0 0 0 0 0 0 0 0...|
|27858537|0 1 0 0 0 0 0 0 0...|
+--------+--------------------+
only showing top 20 rows



In [43]:
# Export the predictions as CSV with a header row; repartition(1) puts all rows
# in one partition so the output directory holds a single part file.
# mode("overwrite") keeps the notebook re-runnable: with the default save mode
# the write raises as soon as "results_task1" exists from an earlier run.
df_final.repartition(1).write.mode("overwrite").option("header", "true").csv("results_task1")

In [44]:
# Number of rows in the final prediction frame (sanity check on the export).
df_final.count()

7777