In [1]:
#https://spark.apache.org/docs/latest/sql-data-sources-json.html
# uses jsonl format (json lines)

In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Word2Vec, StopWordsRemover
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import LogisticRegression
import pyspark.sql.functions as F
import pyspark.sql.types as T

sc = SparkContext('local')
spark = SparkSession(sc)

In [3]:
df = spark.read.json("caselaw_data/new_mexico_data.jsonl")

In [4]:
df.printSchema()

root
 |-- casebody: struct (nullable = true)
 |    |-- data: struct (nullable = true)
 |    |    |-- attorneys: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- head_matter: string (nullable = true)
 |    |    |-- judges: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- opinions: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- author: string (nullable = true)
 |    |    |    |    |-- text: string (nullable = true)
 |    |    |    |    |-- type: string (nullable = true)
 |    |    |-- parties: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |-- status: string (nullable = true)
 |-- citations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- cite: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- court: struct (nullable = true)
 |   

In [5]:
df.createOrReplaceTempView("df")

In [6]:
id_df = spark.sql("SELECT casebody.data.opinions.text FROM df LIMIT 1")

In [7]:
#id_df.show()

In [8]:
id_df.toPandas().iloc[0,0]

['OPINION\nWECHSLER, Chief Judge.\n{1} Plaintiffs appeal from a district court order awarding summary judgment in favor of Defendants and denying Plaintiffs’ request to amend their complaint. Based upon our determination that the doctrine of merger is inapplicable under the circumstances presented by this case, the district court properly concluded that the bankruptcy documents should be construed together. In addition, we decline to remand this case to the district court for consideration of the issue of the parties’ intent. The district court did not abuse its discretion in denying Plaintiffs’ motion to amend. We therefore affirm.\nBackground\n{2} Angel Fire is a resort community located in Colfax County. Owners of real property within the resort area are assessed annual fees, the amount of which varies depending upon the nature of the property (e.g., residential or commercial; developed or undeveloped). In exchange for payment of the annual dues assessment, property owners receive a

In [9]:
id_df = id_df.withColumn("all_text", F.explode("text"))

In [10]:
id_df.createOrReplaceTempView("id_df")

In [11]:
all_df = spark.sql("select all_text from id_df")

In [12]:
all_df.toPandas()['all_text'][0]

'OPINION\nWECHSLER, Chief Judge.\n{1} Plaintiffs appeal from a district court order awarding summary judgment in favor of Defendants and denying Plaintiffs’ request to amend their complaint. Based upon our determination that the doctrine of merger is inapplicable under the circumstances presented by this case, the district court properly concluded that the bankruptcy documents should be construed together. In addition, we decline to remand this case to the district court for consideration of the issue of the parties’ intent. The district court did not abuse its discretion in denying Plaintiffs’ motion to amend. We therefore affirm.\nBackground\n{2} Angel Fire is a resort community located in Colfax County. Owners of real property within the resort area are assessed annual fees, the amount of which varies depending upon the nature of the property (e.g., residential or commercial; developed or undeveloped). In exchange for payment of the annual dues assessment, property owners receive a 

In [13]:
tokenizer = Tokenizer(inputCol="all_text", outputCol="words")
wordsData = tokenizer.transform(all_df)
wordsData.show(5)

+--------------------+--------------------+
|            all_text|               words|
+--------------------+--------------------+
|OPINION
WECHSLER,...|[opinion, wechsle...|
+--------------------+--------------------+



In [14]:
#https://spark.apache.org/docs/2.2.0/ml-features.html#stopwordsremover
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
remover.transform(wordsData).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
wordsData.toPandas()['words'][0]

['opinion',
 'wechsler,',
 'chief',
 'judge.',
 '{1}',
 'plaintiffs',
 'appeal',
 'from',
 'a',
 'district',
 'court',
 'order',
 'awarding',
 'summary',
 'judgment',
 'in',
 'favor',
 'of',
 'defendants',
 'and',
 'denying',
 'plaintiffs’',
 'request',
 'to',
 'amend',
 'their',
 'complaint.',
 'based',
 'upon',
 'our',
 'determination',
 'that',
 'the',
 'doctrine',
 'of',
 'merger',
 'is',
 'inapplicable',
 'under',
 'the',
 'circumstances',
 'presented',
 'by',
 'this',
 'case,',
 'the',
 'district',
 'court',
 'properly',
 'concluded',
 'that',
 'the',
 'bankruptcy',
 'documents',
 'should',
 'be',
 'construed',
 'together.',
 'in',
 'addition,',
 'we',
 'decline',
 'to',
 'remand',
 'this',
 'case',
 'to',
 'the',
 'district',
 'court',
 'for',
 'consideration',
 'of',
 'the',
 'issue',
 'of',
 'the',
 'parties’',
 'intent.',
 'the',
 'district',
 'court',
 'did',
 'not',
 'abuse',
 'its',
 'discretion',
 'in',
 'denying',
 'plaintiffs’',
 'motion',
 'to',
 'amend.',
 'we',
 'the

In [16]:
word2Vec = Word2Vec(inputCol="words", outputCol="result")

In [17]:
model = word2Vec.fit(wordsData)

In [18]:
model.getVectors().show(100)

+--------------+--------------------+
|          word|              vector|
+--------------+--------------------+
| specifically,|[-0.0129815125837...|
|        assert|[-0.0040904274210...|
|          read|[-0.0221348684281...|
|           for|[-0.0547507852315...|
|           any|[-0.0081878053024...|
|        review|[-0.0326722413301...|
|       parties|[-0.0230945013463...|
|         court|[-0.0074566327966...|
|   plaintiffs’|[-0.0113282604143...|
|          this|[-0.0308493226766...|
|            in|[-0.0713620856404...|
|          have|[0.01049211528152...|
|         aafpo|[-0.0074017094448...|
|           are|[-0.0082204807549...|
|            is|[-0.0632997602224...|
|          p.2d|[-0.0891183391213...|
|     addition,|[0.01015991810709...|
|         plan,|[-0.0243973527103...|
|   declaration|[-0.0245011355727...|
|         given|[-0.0156992431730...|
|          real|[0.00299219577573...|
|  supplemental|[-0.0161659754812...|
|       denying|[0.02711229026317...|
|          f