In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col
# Start (or re-attach to) the notebook's Spark session and keep a handle on
# its SparkContext for the context-based APIs used further down.
spark = (
    SparkSession.builder
    .appName('733')
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Load the annual CSV (header row, inferred column types), keep only the first
# 1000 rows and cache that sample, since every later cell reuses it.
annual_df = (
    spark.read
    .csv('../annual_compustat.csv', header=True, inferSchema=True)
    .limit(1000)
    .cache()
)

In [3]:
# Spark view of the null-count CSV, read without a header row.
# NOTE(review): `nullcounts` is not referenced by any later cell visible here;
# the same file is re-read with the csv module in the next cell. Confirm
# whether this read is still needed.
nullcounts = spark.read.csv('annual_compustat_null_count.csv', header=False)

In [4]:
import csv

# Read the null-count CSV into a plain list of rows (each row is a list of
# strings). Presumably the first row holds one null count per column of the
# annual file, in column order; the following cells rely on that.
# newline='' is what the csv module documentation asks for when handing a file
# object to csv.reader, so quoted fields with embedded newlines parse correctly.
with open('annual_compustat_null_count.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    your_list = list(reader)



In [5]:
# Keep only the first CSV row; later cells index it by column position.
null_count_list = your_list[0]

In [6]:
# The csv module yields strings; turn every count into a float so it can be
# compared numerically.
null_count_list = list(map(float, null_count_list))

In [7]:
# Positions whose recorded null count is exactly zero.
good_columns = [
    position
    for position, null_count in enumerate(null_count_list)
    if null_count == 0
]

In [9]:
# Translate the zero-null positions into the matching column names.
source_columns = annual_df.columns
great_columns = [source_columns[position] for position in good_columns]

In [10]:
# 'rea' is the column later copied into `label`, so it has to survive the
# column filter even though it is not in the zero-null list.
# The membership check keeps this cell idempotent: re-running it (or 'rea'
# already being selected) must not add the name twice, because a duplicated
# name would make the later select() produce two 'rea' columns.
if 'rea' not in great_columns:
    great_columns.append('rea')

In [11]:
# NOTE(review): columns_num is not read by any cell visible here.
columns_num = [3, 10, 14]

# Narrow the frame to the names collected in great_columns.
annual_df = annual_df.select(great_columns)


In [12]:
# Replacement map for fillna: every column name maps to 0.
some_dict = dict.fromkeys(annual_df.columns, 0)


In [13]:
# Replace remaining nulls using the per-column replacement map.
permuted_annual_df = annual_df.na.fill(some_dict)

In [14]:
# (column name, type name) pairs; used in the next cell to filter out string columns.
permuted_annual_dtypes = permuted_annual_df.dtypes

In [15]:
# Names of all columns whose Spark type is anything other than 'string'.
non_string_columns = [
    name
    for name, type_name in permuted_annual_dtypes
    if type_name != 'string'
]

In [16]:
# Drop the string columns by selecting only the non-string ones.
permuted_annual_df_no_strings = permuted_annual_df.select(non_string_columns)

In [17]:
# Predictor columns: everything except the target ('rea') and any existing
# 'features' column.
feature_columns = [
    name
    for name in permuted_annual_df_no_strings.columns
    if name != 'rea' and name != 'features'
]

In [18]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.feature import VectorAssembler

# Pack every predictor column into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
final_df = assembler.transform(permuted_annual_df_no_strings)

In [19]:
# The individual predictor columns now live inside `features`; drop them so
# only the remaining columns (shown below: rea, features) are kept.
final_final_df = final_df.drop(*feature_columns)

In [20]:
# Preview the top 20 rows of the assembled frame.
final_final_df.show()

+------+--------------------+
|   rea|            features|
+------+--------------------+
|   0.0|[1000.0,1.9611231...|
|   0.0|[1000.0,1.9621231...|
|   0.0|[1000.0,1.9631231...|
|   0.0|[1000.0,1.9641231...|
|   0.0|[1000.0,1.9651231...|
|   0.0|[1000.0,1.9661231...|
|   0.0|[1000.0,1.9671231...|
|   0.0|[1000.0,1.9681231...|
| 2.772|[1000.0,1.9691231...|
|   0.0|[1000.0,1.9701231...|
|   0.0|[1000.0,1.9711231...|
|   0.0|[1000.0,1.9721231...|
|   0.0|[1000.0,1.9731231...|
|   0.0|[1000.0,1.9741231...|
|-1.656|[1000.0,1.9751231...|
|   0.0|[1000.0,1.9761231...|
|   0.0|[1000.0,1.9771231...|
|   0.0|[1001.0,1.9781231...|
|   0.0|[1001.0,1.9791231...|
|   0.0|[1001.0,1.9801231...|
+------+--------------------+
only showing top 20 rows



In [21]:
# Duplicate 'rea' under the name 'label', the default label column name
# expected by the estimators used below.
final_final_df = final_final_df.withColumn('label', final_final_df['rea'])

In [22]:
# Preview again to confirm that `label` mirrors `rea`.
final_final_df.show()

+------+--------------------+------+
|   rea|            features| label|
+------+--------------------+------+
|   0.0|[1000.0,1.9611231...|   0.0|
|   0.0|[1000.0,1.9621231...|   0.0|
|   0.0|[1000.0,1.9631231...|   0.0|
|   0.0|[1000.0,1.9641231...|   0.0|
|   0.0|[1000.0,1.9651231...|   0.0|
|   0.0|[1000.0,1.9661231...|   0.0|
|   0.0|[1000.0,1.9671231...|   0.0|
|   0.0|[1000.0,1.9681231...|   0.0|
| 2.772|[1000.0,1.9691231...| 2.772|
|   0.0|[1000.0,1.9701231...|   0.0|
|   0.0|[1000.0,1.9711231...|   0.0|
|   0.0|[1000.0,1.9721231...|   0.0|
|   0.0|[1000.0,1.9731231...|   0.0|
|   0.0|[1000.0,1.9741231...|   0.0|
|-1.656|[1000.0,1.9751231...|-1.656|
|   0.0|[1000.0,1.9761231...|   0.0|
|   0.0|[1000.0,1.9771231...|   0.0|
|   0.0|[1001.0,1.9781231...|   0.0|
|   0.0|[1001.0,1.9791231...|   0.0|
|   0.0|[1001.0,1.9801231...|   0.0|
+------+--------------------+------+
only showing top 20 rows



In [23]:
# final_final_df.write.parquet("final_final_df2.parquet")

In [24]:
import os

from pyspark.sql import SQLContext

# Reload the assembled (rea, features, label) frame from its parquet snapshot.
# The cell that writes the snapshot is commented out, so on a fresh checkout
# the directory does not exist and the read would fail. When it is missing,
# materialise it from `final_final_df` first so Restart & Run All works.
# NOTE(review): os.path.exists assumes Spark resolves this relative path on the
# local filesystem (the null-count CSV is read from the same relative location
# through both Spark and plain `open`) - confirm before pointing this at
# HDFS/S3.
ML_DF_PARQUET = "final_final_df2.parquet"
if not os.path.exists(ML_DF_PARQUET):
    final_final_df.write.parquet(ML_DF_PARQUET)

sqlContext = SQLContext(sc)
ml_df = sqlContext.read.parquet(ML_DF_PARQUET)

In [25]:
# Preview the top 20 rows of the frame loaded from parquet.
ml_df.show()

+------+--------------------+------+
|   rea|            features| label|
+------+--------------------+------+
|   0.0|[1000.0,1.9611231...|   0.0|
|   0.0|[1000.0,1.9621231...|   0.0|
|   0.0|[1000.0,1.9631231...|   0.0|
|   0.0|[1000.0,1.9641231...|   0.0|
|   0.0|[1000.0,1.9651231...|   0.0|
|   0.0|[1000.0,1.9661231...|   0.0|
|   0.0|[1000.0,1.9671231...|   0.0|
|   0.0|[1000.0,1.9681231...|   0.0|
| 2.772|[1000.0,1.9691231...| 2.772|
|   0.0|[1000.0,1.9701231...|   0.0|
|   0.0|[1000.0,1.9711231...|   0.0|
|   0.0|[1000.0,1.9721231...|   0.0|
|   0.0|[1000.0,1.9731231...|   0.0|
|   0.0|[1000.0,1.9741231...|   0.0|
|-1.656|[1000.0,1.9751231...|-1.656|
|   0.0|[1000.0,1.9761231...|   0.0|
|   0.0|[1000.0,1.9771231...|   0.0|
|   0.0|[1001.0,1.9781231...|   0.0|
|   0.0|[1001.0,1.9791231...|   0.0|
|   0.0|[1001.0,1.9801231...|   0.0|
+------+--------------------+------+
only showing top 20 rows



In [26]:
from pyspark.ml.regression import LinearRegression

# Regularised linear regression of `label` on `features`, capped at 10 iterations.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit on the whole assembled frame; no hold-out split is made for this model.
train = final_final_df
lrModel = lr.fit(train)

# Learned parameters.
print("Coefficients: %s" % (lrModel.coefficients,))
print("Intercept: %s" % (lrModel.intercept,))

# Diagnostics computed on the training data itself.
trainingSummary = lrModel.summary
print("numIterations: %d" % (trainingSummary.totalIterations,))
print("objectiveHistory: %s" % (trainingSummary.objectiveHistory,))
trainingSummary.residuals.show()
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("r2: %f" % (trainingSummary.r2,))

Coefficients: [0.0,-1.06449397447e-06,-0.00244103246633,0.0,0.0,0.0,0.0,0.0,0.0]
Intercept: 25.74190069013414
numIterations: 11
objectiveHistory: [0.5000000000000001, 0.4999142911765882, 0.4998611581179943, 0.4998610757468472, 0.49986107450295986, 0.4998610630318895, 0.4998610567796463, 0.499861007730838, 0.49986088950678204, 0.4998608886775636, 0.4998608885466327]
+--------------------+
|           residuals|
+--------------------+
|-0.07899879220746797|
|-0.06591281999643073|
|-0.05282684778539348|
|-0.03974087557435624|
|-0.02665490336331...|
|-0.01356893115227...|
|-4.82958941244504...|
| 0.01260301326979274|
|    2.79768898548083|
| 0.03877495769186723|
| 0.05186092990290447|
| 0.06494690211394172|
| 0.07803287432498252|
| 0.09111884653601621|
| -1.5517951812529465|
|  0.1172907909580907|
| 0.13037676316912794|
| 0.14346273538016519|
| 0.15654870759120243|
| 0.16963467980224323|
+--------------------+
only showing top 20 rows

RMSE: 10.106638
r2: 0.001070


In [27]:
# Preview ml_df before the label is converted to a binary indicator.
ml_df.show()

+------+--------------------+------+
|   rea|            features| label|
+------+--------------------+------+
|   0.0|[1000.0,1.9611231...|   0.0|
|   0.0|[1000.0,1.9621231...|   0.0|
|   0.0|[1000.0,1.9631231...|   0.0|
|   0.0|[1000.0,1.9641231...|   0.0|
|   0.0|[1000.0,1.9651231...|   0.0|
|   0.0|[1000.0,1.9661231...|   0.0|
|   0.0|[1000.0,1.9671231...|   0.0|
|   0.0|[1000.0,1.9681231...|   0.0|
| 2.772|[1000.0,1.9691231...| 2.772|
|   0.0|[1000.0,1.9701231...|   0.0|
|   0.0|[1000.0,1.9711231...|   0.0|
|   0.0|[1000.0,1.9721231...|   0.0|
|   0.0|[1000.0,1.9731231...|   0.0|
|   0.0|[1000.0,1.9741231...|   0.0|
|-1.656|[1000.0,1.9751231...|-1.656|
|   0.0|[1000.0,1.9761231...|   0.0|
|   0.0|[1000.0,1.9771231...|   0.0|
|   0.0|[1001.0,1.9781231...|   0.0|
|   0.0|[1001.0,1.9791231...|   0.0|
|   0.0|[1001.0,1.9801231...|   0.0|
+------+--------------------+------+
only showing top 20 rows



In [28]:
# Flag rows whose 'rea' value is non-zero.
ml_df = ml_df.withColumn('boolean_label', ml_df['rea'] != 0)

In [29]:
# Overwrite `label` with the flag as a float (false -> 0.0, true -> 1.0).
ml_df = ml_df.withColumn('label', ml_df['boolean_label'].cast('float'))

In [30]:
# Preview the frame with the binary label next to the boolean flag.
ml_df.show()

+------+--------------------+-----+-------------+
|   rea|            features|label|boolean_label|
+------+--------------------+-----+-------------+
|   0.0|[1000.0,1.9611231...|  0.0|        false|
|   0.0|[1000.0,1.9621231...|  0.0|        false|
|   0.0|[1000.0,1.9631231...|  0.0|        false|
|   0.0|[1000.0,1.9641231...|  0.0|        false|
|   0.0|[1000.0,1.9651231...|  0.0|        false|
|   0.0|[1000.0,1.9661231...|  0.0|        false|
|   0.0|[1000.0,1.9671231...|  0.0|        false|
|   0.0|[1000.0,1.9681231...|  0.0|        false|
| 2.772|[1000.0,1.9691231...|  1.0|         true|
|   0.0|[1000.0,1.9701231...|  0.0|        false|
|   0.0|[1000.0,1.9711231...|  0.0|        false|
|   0.0|[1000.0,1.9721231...|  0.0|        false|
|   0.0|[1000.0,1.9731231...|  0.0|        false|
|   0.0|[1000.0,1.9741231...|  0.0|        false|
|-1.656|[1000.0,1.9751231...|  1.0|         true|
|   0.0|[1000.0,1.9761231...|  0.0|        false|
|   0.0|[1000.0,1.9771231...|  0.0|        false|


In [31]:
# Only `features` and the binary `label` are needed from here on.
ml_df = ml_df.drop('rea', 'boolean_label')


In [33]:
# Split the data into train and test (60% / 40%, fixed seed 12)
splits = ml_df.randomSplit([0.6, 0.4], 12)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# - the input layer must be exactly as wide as the assembled feature vector,
#   so its size is read from the data instead of being hard-coded. The previous
#   value, 1514, does not match this notebook's vectors: the linear-regression
#   output lists 9 coefficients.
# - two hidden layers of 1514 units each (unchanged).
#   NOTE(review): these look like copies of the old input width - confirm the
#   intended hidden sizes.
# - an output layer of size 2, one unit per class of the 0.0 / 1.0 label.
first_feature_row = ml_df.select('features').first()
if first_feature_row is None:
    raise ValueError("ml_df is empty; cannot infer the feature vector size")
num_features = len(first_feature_row[0])
layers = [num_features, 1514, 1514, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=10, layers=layers, blockSize=128, seed=1234)

In [39]:
# train the model
# model = trainer.fit(train)

In [None]:
# # compute accuracy on the test set
# result = model.transform(test)
# predictionAndLabels = result.select("prediction", "label")
# evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

In [None]:
import numpy as np
# Pull the training labels to the driver as a NumPy array.
# NOTE(review): collect() returns one single-field Row per sample, so this is
# presumably an (n, 1) array rather than a flat vector - confirm.
label_np = np.array(train.select('label').collect())

In [None]:
# Pull the training feature vectors to the driver.
# NOTE(review): each collected Row wraps a single vector, which is why the next
# cell takes element 0 of every entry; the exact array shape/dtype depends on
# how NumPy coerces the Spark vectors - verify.
features_np = np.array(train.select('features').collect())

In [None]:
# Unwrap the single element held by each collected entry.
features_np_flat = [entry[0] for entry in features_np]

In [None]:
# Stack the per-sample feature arrays vertically into one matrix; this is the
# input passed to the Keras model's fit() below.
result = np.vstack(features_np_flat)

In [None]:
# For a single-input model with 2 classes (binary classification):
from keras.models import Model
from keras.layers import Input, Dense
from keras.models import Sequential

# Width of the input layer follows the feature matrix instead of a hard-coded
# 9, so the network keeps matching the data if the selected columns change.
n_inputs = result.shape[1]

model = Sequential()
# One hidden layer of 18 ReLU units.
model.add(Dense(18, activation='relu', input_dim=n_inputs))
# model.add(Dense(18, activation='relu', input_dim=18))
# Single sigmoid unit for the binary label. Only the first layer of a
# Sequential model needs an input size, so the redundant input_dim that used
# to be passed here is dropped.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Generate dummy data
# import numpy as np
# data = np.random.random((1000, 100))
# labels = np.random.randint(2, size=(1000, 1))

# Train the model, iterating on the data in batches of 32 samples
# NOTE(review): `result` / `label_np` come from the `train` split only, so the
# accuracy reported during fitting is training accuracy; `test` is not
# evaluated in this cell.
model.fit(result, label_np, epochs=100, batch_size=32)

In [None]:
# Distinct label values in the training split and how often each occurs.
unique, counts = np.unique(label_np, return_counts=True)

In [None]:
# Display the distinct label values.
unique

In [None]:
# Display the number of training samples per label value (same order as `unique`).
counts

In [None]:
# Share of the training labels taken by the most frequent class, i.e. the
# accuracy obtained by always predicting that class. Computed from `counts`
# rather than the hard-coded 547/(49+547) so it tracks the current split.
counts.max() / counts.sum()