In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col

spark = SparkSession.builder.appName('733').getOrCreate()
sc = spark.sparkContext

In [2]:
annual_df = spark.read.csv('../annual_compustat.csv', header=True, inferSchema=True) #.limit(20000)

In [3]:
misstatements = spark.read.csv('../misstatements.csv', header=True, inferSchema=True)

In [4]:
new_df = annual_df.join(misstatements, annual_df.tic == misstatements.ticker)

In [5]:
new_df.select('tic').distinct().count()

1095

In [6]:
rea_df = annual_df.select('rea')
rea_df = rea_df.filter(rea_df.rea.isNotNull())

In [7]:
df = rea_df.select(col('rea'), rea_df.rea.cast('float').alias('float_rea'))
rea_df = rea_df.where(rea_df.rea != 0)
rea_df.show()

+-------+
|    rea|
+-------+
|  2.772|
| -1.656|
| -0.323|
|-35.188|
| -0.057|
| -0.268|
| -0.197|
| -1.484|
|   17.6|
| -158.8|
| -0.366|
| 69.802|
|115.744|
| -1.281|
| -0.225|
|  0.011|
|  0.074|
|  -0.18|
| -0.207|
| -0.041|
+-------+
only showing top 20 rows



In [8]:
data = df.select('float_rea').collect()

In [9]:
# percentage = float(raw_input('What percentage? '))
import random
import numpy

percentage = 5
k = len(data) * percentage // 100
indices = random.sample(range(len(data)), k)
new_list1 = [data[i] for i in indices]

In [10]:
len(new_list1)

14424

In [11]:
new_list1[0]
rea_vector = [x[0] for x in new_list1]

In [12]:
import matplotlib.pyplot as plt
import numpy as np
# %matplotlib inline
# x = np.random.normal(size = 1000)
plt.hist(rea_vector, bins=50)
plt.ylabel('count');

In [13]:
annual_df.show()

+-----+--------+-----+------+------+------+-------+-----+---------+--------------------+-------+-------+-------+----+------+---+----+------+-----+------+------+-------+-----+---+-----+----+----+-----+----+----+-----+----+---+--------+-----+-----+-----+----+----+-----+-----+-------+-----+------+-----+-------+-------+-----+--------+-------+--------+---------+------+-----+------+-----+----+------+------+----+-----+----+----+----+----+-----+---------+---------+-------+---------+----+----+------+-----+-----+------+----+----+-----+----+-----+----+-----+------+----+-----+-----+----+----+----+-----+-------+-----+------+----+-----+----+-----+----+-----+----+----+----+------+-----+-----+----+-----+----+------+-----+----+-----+-----+-----+-----+-----+------+----+----+-----+----+------+------+------+----+-----+----+-----+----+----+----+----+-----+-----+-----+----+----+-------+------+-------+-----+-------+-----+-------+-------+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+-

In [14]:
# nullcounts = annual_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in annual_df.columns])

In [15]:
nullcounts = spark.read.csv('annual_compustat_null_count.csv', header=False)

In [16]:
nullcounts.show()

+---+---+---+---+---+---+---+---+---+---+----+----+------+------+----+----+------+------+----+-----+-----+------+----+----+-----+------+----+------+-----+------+----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+-----+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+-----+------+-----+------+------+------+------+------+------+------+------+------+---

In [17]:
import csv

with open('annual_compustat_null_count.csv', 'r') as f:
  reader = csv.reader(f)
  your_list = list(reader)

print(your_list)

[['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '818306', '810462', '0', '0', '698054', '702977', '0', '80675', '80675', '812004', '1677', '0', '33518', '817171', '0', '353427', '39341', '817725', '8', '717552', '746149', '773515', '299177', '454021', '705981', '213877', '267111', '714259', '303335', '784224', '749514', '746693', '748961', '746753', '784292', '179047', '186448', '810415', '810825', '810899', '784279', '809542', '820413', '820413', '820412', '165814', '784246', '804711', '804855', '804467', '174806', '138557', '177875', '727129', '741433', '742850', '743217', '773264', '741564', '742519', '773235', '743117', '741047', '144712', '80361', '625074', '661421', '776101', '105874', '72763', '720269', '55940', '163696', '715392', '802179', '651160', '651197', '182983', '144506', '131427', '817887', '339136', '241545', '817752', '744674', '738782', '809866', '810452', '810484', '809359', '65843', '612144', '133261', '134083', '96792', '92473', '129968', '524366', 

In [18]:
null_count_list = your_list[0]

In [19]:
null_count_list = [float(x) for x in null_count_list]

In [20]:
good_columns = []
for i in range(0, len(null_count_list)):
    if null_count_list[i]==0:
        good_columns.append(i)
    

In [21]:
good_columns

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 18, 23, 26, 599, 601, 602]

In [22]:
columns_num = [3, 10, 14]
df2 = annual_df.select(*(annual_df.columns[i] for i in good_columns))
df2.show()

+-----+--------+-----+------+------+------+-------+-----+---------+--------------------+-------+-------+------+---+-----+---+----+------+-------+----+
|gvkey|datadate|fyear|indfmt|consol|popsrc|datafmt|  tic|    cusip|                conm|acctchg|acctstd|  ajex|ajp|curcd|fyr| ogm|prstkc|prstkpc|prvt|
+-----+--------+-----+------+------+------+-------+-----+---------+--------------------+-------+-------+------+---+-----+---+----+------+-------+----+
| 1000|19611231| 1961|  INDL|     C|     D|    STD| AE.2|000032102|A & E PLASTIK PAK...|   null|   null|3.3418|1.0|  USD| 12|null|  null|   null|null|
| 1000|19621231| 1962|  INDL|     C|     D|    STD| AE.2|000032102|A & E PLASTIK PAK...|   null|   null|3.3418|1.0|  USD| 12|null|  null|   null|null|
| 1000|19631231| 1963|  INDL|     C|     D|    STD| AE.2|000032102|A & E PLASTIK PAK...|   null|   null|3.2445|1.0|  USD| 12|null|  null|   null|null|
| 1000|19641231| 1964|  INDL|     C|     D|    STD| AE.2|000032102|A & E PLASTIK PAK...|   nul

In [23]:
df2.first()

Row(gvkey=1000, datadate=19611231, fyear=1961, indfmt='INDL', consol='C', popsrc='D', datafmt='STD', tic='AE.2', cusip='000032102', conm='A & E PLASTIK PAK INC', acctchg=None, acctstd=None, ajex=3.3418, ajp=1.0, curcd='USD', fyr=12, ogm=None, prstkc=None, prstkpc=None, prvt=None)

In [24]:
annual_df.first()

Row(gvkey=1000, datadate=19611231, fyear=1961, indfmt='INDL', consol='C', popsrc='D', datafmt='STD', tic='AE.2', cusip='000032102', conm='A & E PLASTIK PAK INC', acctchg=None, acctstd=None, acqmeth=None, adrr=None, ajex=3.3418, ajp=1.0, bspr=None, compst=None, curcd='USD', curncd='USD', currtr=1.0, curuscn=None, final='Y', fyr=12, ismod=None, ltcm=None, ogm=None, pddur=12, scf=None, src=None, stalt=None, udpl=None, upd=3, apdedate=None, fdate=None, pdate=None, acchg=None, acco=None, acdo=None, aco=None, acodo=None, acominc=None, acox=None, acoxar=None, acqao=None, acqcshi=None, acqgdwl=None, acqic=None, acqintan=None, acqinvt=None, acqlntal=None, acqniintc=None, acqppe=None, acqsc=None, act=None, adpac=None, aedi=None, afudcc=None, afudci=None, aldo=None, am=None, amc=None, amdc=None, amgw=None, ano=None, ao=None, aocidergl=None, aociother=None, aocipen=None, aocisecgl=None, aodo=None, aol2=None, aoloch=None, aox=None, ap=None, apalch=None, apb=None, apc=None, apofs=None, aqa=None, aqc

In [30]:
permuted_annual_df = annual_df.na.fill(2)
permuted_annual_df = annual_df.na.fill('4')

In [32]:
permuted_annual_df.show()

+-----+--------+-----+------+------+------+-------+-----+---------+--------------------+-------+-------+-------+----+------+---+----+------+-----+------+------+-------+-----+---+-----+----+---+-----+----+----+-----+----+---+--------+-----+-----+-----+----+----+-----+-----+-------+-----+------+-----+-------+-------+-----+--------+-------+--------+---------+------+-----+------+-----+----+------+------+----+-----+----+----+----+----+-----+---------+---------+-------+---------+----+----+------+-----+-----+------+----+----+-----+----+-----+----+-----+------+----+-----+-----+----+----+----+-----+-------+-----+------+----+-----+----+-----+----+-----+----+----+----+------+-----+-----+----+-----+----+------+-----+----+-----+-----+-----+-----+-----+------+----+----+-----+----+------+------+------+----+-----+----+-----+----+----+----+----+-----+-----+-----+----+----+-------+------+-------+-----+-------+-----+-------+-------+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+--

In [33]:
some_dict = {}
for x in annual_df.columns:
    some_dict[x] = 0
some_dict

{'gvkey': 0,
 'datadate': 0,
 'fyear': 0,
 'indfmt': 0,
 'consol': 0,
 'popsrc': 0,
 'datafmt': 0,
 'tic': 0,
 'cusip': 0,
 'conm': 0,
 'acctchg': 0,
 'acctstd': 0,
 'acqmeth': 0,
 'adrr': 0,
 'ajex': 0,
 'ajp': 0,
 'bspr': 0,
 'compst': 0,
 'curcd': 0,
 'curncd': 0,
 'currtr': 0,
 'curuscn': 0,
 'final': 0,
 'fyr': 0,
 'ismod': 0,
 'ltcm': 0,
 'ogm': 0,
 'pddur': 0,
 'scf': 0,
 'src': 0,
 'stalt': 0,
 'udpl': 0,
 'upd': 0,
 'apdedate': 0,
 'fdate': 0,
 'pdate': 0,
 'acchg': 0,
 'acco': 0,
 'acdo': 0,
 'aco': 0,
 'acodo': 0,
 'acominc': 0,
 'acox': 0,
 'acoxar': 0,
 'acqao': 0,
 'acqcshi': 0,
 'acqgdwl': 0,
 'acqic': 0,
 'acqintan': 0,
 'acqinvt': 0,
 'acqlntal': 0,
 'acqniintc': 0,
 'acqppe': 0,
 'acqsc': 0,
 'act': 0,
 'adpac': 0,
 'aedi': 0,
 'afudcc': 0,
 'afudci': 0,
 'aldo': 0,
 'am': 0,
 'amc': 0,
 'amdc': 0,
 'amgw': 0,
 'ano': 0,
 'ao': 0,
 'aocidergl': 0,
 'aociother': 0,
 'aocipen': 0,
 'aocisecgl': 0,
 'aodo': 0,
 'aol2': 0,
 'aoloch': 0,
 'aox': 0,
 'ap': 0,
 'apalch': 0,


In [34]:
permuted_annual_df = annual_df.fillna(some_dict)

In [35]:
permuted_annual_df.first()

Row(gvkey=1000, datadate=19611231, fyear=1961, indfmt='INDL', consol='C', popsrc='D', datafmt='STD', tic='AE.2', cusip='000032102', conm='A & E PLASTIK PAK INC', acctchg='0', acctstd='0', acqmeth='0', adrr=0.0, ajex=3.3418, ajp=1.0, bspr='0', compst='0', curcd='USD', curncd='USD', currtr=1.0, curuscn=0.0, final='Y', fyr=12, ismod=0, ltcm=0, ogm='0', pddur=12, scf=0, src=0, stalt='0', udpl=0, upd=3, apdedate=0, fdate=0, pdate=0, acchg=0.0, acco=0.0, acdo=0.0, aco=0.0, acodo=0.0, acominc=0.0, acox=0.0, acoxar=0.0, acqao=0.0, acqcshi=0.0, acqgdwl=0.0, acqic=0.0, acqintan=0.0, acqinvt=0.0, acqlntal=0.0, acqniintc=0.0, acqppe=0.0, acqsc=0.0, act=0.0, adpac=0.0, aedi=0.0, afudcc=0.0, afudci=0.0, aldo=0.0, am=0.0, amc=0.0, amdc=0.0, amgw=0.0, ano=0.0, ao=0.0, aocidergl=0.0, aociother=0.0, aocipen=0.0, aocisecgl=0.0, aodo=0.0, aol2=0.0, aoloch=0.0, aox=0.0, ap=0.0, apalch=0.0, apb=0.0, apc=0.0, apofs=0.0, aqa=0.0, aqc=0.0, aqd=0.0, aqeps=0.0, aqi=0.0, aqp=0.0, aqpl1=0.0, aqs=0.0, arb=0.0, arc=

In [36]:
permuted_annual_df.select('conm')

DataFrame[conm: string]

In [42]:
permuted_annual_dtypes = permuted_annual_df.dtypes

In [47]:
string_columns = [k for (k,v) in permuted_annual_dtypes if v == 'string']

In [52]:
text_columns = [k for (k,v) in permuted_annual_dtypes if v == 'text']

In [53]:
text_columns

[]

In [49]:
permuted_annual_df_no_strings = permuted_annual_df.drop(*string_columns)

In [54]:
permuted_annual_df_no_strings.printSchema

<bound method DataFrame.printSchema of DataFrame[gvkey: int, datadate: int, fyear: int, adrr: double, ajex: double, ajp: double, currtr: double, curuscn: double, fyr: int, ismod: int, ltcm: int, pddur: int, scf: int, src: int, udpl: int, upd: int, apdedate: int, fdate: int, pdate: int, acchg: double, acco: double, acdo: double, aco: double, acodo: double, acominc: double, acox: double, acoxar: double, acqao: double, acqcshi: double, acqgdwl: double, acqic: double, acqintan: double, acqinvt: double, acqlntal: double, acqniintc: double, acqppe: double, acqsc: double, act: double, adpac: double, aedi: double, afudcc: double, afudci: double, aldo: double, am: double, amc: double, amdc: double, amgw: double, ano: double, ao: double, aocidergl: double, aociother: double, aocipen: double, aocisecgl: double, aodo: double, aol2: double, aoloch: double, aox: double, ap: double, apalch: double, apb: double, apc: double, apofs: double, aqa: double, aqc: double, aqd: double, aqeps: double, aqi: dou

In [50]:
permuted_annual_df_no_strings.printSchema

<bound method DataFrame.printSchema of DataFrame[gvkey: int, datadate: int, fyear: int, adrr: double, ajex: double, ajp: double, currtr: double, curuscn: double, fyr: int, ismod: int, ltcm: int, pddur: int, scf: int, src: int, udpl: int, upd: int, apdedate: int, fdate: int, pdate: int, acchg: double, acco: double, acdo: double, aco: double, acodo: double, acominc: double, acox: double, acoxar: double, acqao: double, acqcshi: double, acqgdwl: double, acqic: double, acqintan: double, acqinvt: double, acqlntal: double, acqniintc: double, acqppe: double, acqsc: double, act: double, adpac: double, aedi: double, afudcc: double, afudci: double, aldo: double, am: double, amc: double, amdc: double, amgw: double, ano: double, ao: double, aocidergl: double, aociother: double, aocipen: double, aocisecgl: double, aodo: double, aol2: double, aoloch: double, aox: double, ap: double, apalch: double, apb: double, apc: double, apofs: double, aqa: double, aqc: double, aqd: double, aqeps: double, aqi: dou

In [56]:
permuted_annual_df_no_strings.select('rea').show()

+------+
|   rea|
+------+
|   0.0|
|   0.0|
|   0.0|
|   0.0|
|   0.0|
|   0.0|
|   0.0|
|   0.0|
| 2.772|
|   0.0|
|   0.0|
|   0.0|
|   0.0|
|   0.0|
|-1.656|
|   0.0|
|   0.0|
|   0.0|
|   0.0|
|   0.0|
+------+
only showing top 20 rows



In [63]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load training data
# data = spark.read.format("libsvm")\
#     .load("data/mllib/sample_multiclass_classification_data.txt")

data = permuted_annual_df_no_strings

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=data.columns, outputCol="features")

# encoded_df_with_indexed_bar = (vector_indexer
#     .fit(encoded_df)
#     .transform(encoded_df))

final_df = assembler.transform(data)

In [65]:
final_df.show()

+-----+--------+-----+----+------+---+------+-------+---+-----+----+-----+---+---+----+---+--------+-----+-----+-----+----+----+-----+-----+-------+-----+------+-----+-------+-------+-----+--------+-------+--------+---------+------+-----+------+-----+----+------+------+----+-----+---+----+----+---+-----+---------+---------+-------+---------+----+----+------+-----+-----+------+---+---+-----+---+-----+---+-----+------+---+-----+-----+---+---+----+-----+-------+-----+------+----+-----+----+-----+----+-----+----+----+----+------+-----+-----+---+-----+----+------+-----+---+-----+-----+-----+-----+-----+------+---+---+-----+----+------+------+------+----+-----+---+-----+---+----+----+----+-----+-----+-----+---+---+-------+------+-------+-----+-------+-----+-------+-------+----+----+----+----+----+----+---+----+---+----+---+-----+---+---+------+------+-----+-----+-----+-----+-----+---+------+-----+----+-----+------+-----+-----+-----+------+-----+-----+-----+------+-----+---+---+-----+----+---

In [80]:
feature_columns = [item for item in final_df.columns if item not in ['rea', 'features']]

In [81]:
final_final_df = final_df.drop(*feature_columns)

In [82]:
final_final_df.show()

+------+--------------------+
|   rea|            features|
+------+--------------------+
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
| 2.772|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|-1.656|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
|   0.0|(1515,[0,1,2,4,5,...|
+------+--------------------+
only showing top 20 rows



In [89]:
final_final_df = final_final_df.withColumn('label', final_final_df.rea)

In [90]:
final_final_df.show()

+------+--------------------+------+
|   rea|            features| label|
+------+--------------------+------+
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
| 2.772|(1515,[0,1,2,4,5,...| 2.772|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|-1.656|(1515,[0,1,2,4,5,...|-1.656|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
|   0.0|(1515,[0,1,2,4,5,...|   0.0|
+------+--------------------+------+
only showing top 20 rows



In [97]:
len(final_final_df.select('features').first()[0])

1515

In [91]:
# Split the data into train and test
splits = final_final_df.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 5, 4, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Py4JJavaError: An error occurred while calling o985.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 29.0 failed 1 times, most recent failure: Lost task 6.0 in stage 29.0 (TID 270, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException: -5969
	at org.apache.spark.ml.classification.LabelConverter$.encodeLabeledPoint(MultilayerPerceptronClassifier.scala:121)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$3.apply(MultilayerPerceptronClassifier.scala:245)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$3.apply(MultilayerPerceptronClassifier.scala:245)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1076)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1091)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1128)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1132)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:215)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1158)
	at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:195)
	at org.apache.spark.mllib.optimization.LBFGS.optimize(LBFGS.scala:142)
	at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:817)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:267)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:145)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException: -5969
	at org.apache.spark.ml.classification.LabelConverter$.encodeLabeledPoint(MultilayerPerceptronClassifier.scala:121)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$3.apply(MultilayerPerceptronClassifier.scala:245)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$3.apply(MultilayerPerceptronClassifier.scala:245)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1076)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1091)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1128)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1132)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:215)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [58]:
#based on https://spark.apache.org/docs/2.2.0/ml-tuning.html

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="aedi", outputCol="rea")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

#paramgrid
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [1000, 5000, 10000]) \
    .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2) 

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(permuted_annual_df_no_strings)

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(permuted_annual_df)
selected = prediction.select("id", "text", "probability", "prediction")
# selected = prediction.select("id", "probability", "prediction")

IllegalArgumentException: 'requirement failed: Input type must be string type but got DoubleType.'

In [None]:
v2 = annual_df.first()['nim']

In [None]:
print(v2)

In [None]:
some_value = permuted_annual_df.first()['iseqm']

In [None]:
type(some_value)

In [None]:
annual_df.na

In [None]:
df = spark.createDataFrame([(None,), (2,), (3,), (None,)], ['col'])
df.show()

In [None]:
val = df.first()['col']

In [None]:
type(val)

In [None]:
df.na.fill(0).show()