In [4]:
import pandas as pd
import bz2
import json
import tld
from tld import get_tld

In [5]:
!pip install findspark
!pip install pyspark



### Objectives M2
- That you can handle the data in its size.
- That you understand what’s in the data (formats, distributions, missing values, correlations, etc.).
- That you considered ways to enrich, filter, transform the data according to your needs.
- That you have a reasonable plan and ideas for methods you’re going to use, giving their essential mathematical details in the notebook.
- That your plan for analysis and communication is reasonable and sound, potentially discussing alternatives to your choices that you considered but dropped.

#### Test with DataFrame (DON'T RUN THIS CELL)

In [None]:
%%time
df_quotes_2020 = pd.read_json('data/quotes-2020.json.bz2', compression='bz2',lines=True)

In [None]:
def get_domain(url):
    """Return the ``tld`` attribute of the object ``get_tld`` produces for ``url``.

    The URL is parsed with ``get_tld(url, as_object=True)`` and only the
    ``tld`` attribute of that result is returned.

    NOTE(review): despite the function name, the value returned is the
    result's ``tld`` attribute — confirm this is the part of the URL that
    the ``domains`` column is meant to hold.
    """
    return get_tld(url, as_object=True).tld

In [None]:
path_to_file = 'data/quotes-2020-domains.json.bz2'
max_rows = 10  # only a small sample is loaded for inspection
number = 0     # count of lines parsed so far

# Empty frame that fixes the expected column order of the result.
df = pd.DataFrame(columns=['quoteID', 'quotation', 'speaker', 'qids', 'date', 'numOccurrences',
       'probas', 'urls', 'phase', 'domains'])

# Parse each line into its own frame and concatenate once after the loop;
# concatenating inside the loop re-copies the growing frame on every iteration.
frames = [df]
with bz2.open(path_to_file, 'rb') as s_file:
    for instance in s_file:
        if number == max_rows:
            break
        frames.append(pd.read_json(instance, lines=True))
        # Advance the counter so the break above is actually reachable;
        # without this the whole file was read.
        number += 1

df = pd.concat(frames, ignore_index=True)


### Summary Columns
- **quoteID**:      Primary key of the quotation (format: "YYYY-MM-DD-{increasing int:06d}")
- **quotation**:    Text of the longest encountered original form of the quotation
- **speaker**:      Selected most likely speaker
- **qids**:         Wikidata IDs of all aliases that match the selected speaker
- **date**:         Earliest occurrence date of any version of the quotation
- **numOccurrences**: Number of times this quotation occurs in the articles
- **probas**:       Array representing the probabilities of each speaker having uttered the quotation
- **urls**:         List of links to the original articles containing the quotation
- **phase**:        Corresponding phase of the data in which the quotation first occurred (A-E)
- **domains**:      Domain of the URL 


In [None]:
# Print a summary of `df`: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Display the first 10 rows of `df` as the cell output.
df.head(10)

### Test with PySpark

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

# Reuse the active Spark session if one exists, otherwise start a new one,
# and keep a handle on its SparkContext.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Load the quotes file into a Spark DataFrame, then print the inferred schema
# and a preview of the rows.
df = spark.read.json('data/quotes-2020.json.bz2')
df.printSchema()
df.show()

root
 |-- date: string (nullable = true)
 |-- numOccurrences: long (nullable = true)
 |-- phase: string (nullable = true)
 |-- probas: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- qids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quotation: string (nullable = true)
 |-- quoteID: string (nullable = true)
 |-- speaker: string (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------------------+--------------+-----+--------------------+--------------------+--------------------+-----------------+-------------------+--------------------+
|               date|numOccurrences|phase|              probas|                qids|           quotation|          quoteID|            speaker|                urls|
+-------------------+--------------+-----+--------------------+--------------------+--------------------+-----------------+--

### Analysis 

- Check whether `qids` are the same for each quote
- Check that there are no `qids` when the speaker is None
- Check `probas` against the selected speaker
- Check each column
- Verify the URLs