### Read input documents

In [2]:
docs = []

import os
dname = "/home/eolus/Desktop/Dauphine/bigdata/tfidf/data"
for fname in os.listdir(dname):
    fpath = "{}/{}".format(dname, fname)
    try:
        with open(fpath, 'r') as fp:
            doc = fp.read()
            doc_id = int(fname.replace(".txt", ""))
            docs.append( (doc_id, doc) )
    except Exception as e:
        print(e)

### Spark session

In [3]:
from pyspark.sql import SparkSession, SQLContext

sc = SparkSession\
        .builder\
        .appName("TfIdf Example")\
        .getOrCreate()
        
sql = SQLContext(sc)

### Load input as spark df

In [4]:
documents = sql.createDataFrame(docs, [ "doc_id", "doc_text" ])

Unnamed: 0,doc_id,doc_text
0,978796580789719042,RT @BarrettSallee: Hero https://t.co/4eyjbjtqKp
1,978796298328559616,"Of the 6 new announced #Russianmissiles, actua..."
2,978797301639475200,RT @Ohsehunbar: 2018 SEHUN’s bday support\n[9t...
3,978796516310577153,Pekan Suci Paroki Banteng https://t.co/JbSatyISCW
4,978797071414145030,RT @igot_markedby7: [TRANS] #Jackson replied:\...


### Tokenize

In [10]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

tokenizer = RegexTokenizer(inputCol="doc_text", outputCol="words", pattern="\\W")
#tokenizer = Tokenizer(inputCol="doc_text", outputCol="words")

#from pyspark.sql.functions import col, udf
#from pyspark.sql.types import IntegerType
#countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(documents)
tokenized.select("doc_text", "words").toPandas().head()

Unnamed: 0,doc_text,words
0,RT @BarrettSallee: Hero https://t.co/4eyjbjtqKp,"[rt, barrettsallee, hero, https, t, co, 4eyjbj..."
1,"Of the 6 new announced #Russianmissiles, actua...","[of, the, 6, new, announced, russianmissiles, ..."
2,RT @Ohsehunbar: 2018 SEHUN’s bday support\n[9t...,"[rt, ohsehunbar, 2018, sehun, s, bday, support..."
3,Pekan Suci Paroki Banteng https://t.co/JbSatyISCW,"[pekan, suci, paroki, banteng, https, t, co, j..."
4,RT @igot_markedby7: [TRANS] #Jackson replied:\...,"[rt, igot_markedby7, trans, jackson, replied, ..."


### TF

In [16]:
from pyspark.ml.feature import HashingTF

htf= HashingTF(inputCol="words", outputCol="tf")
tf = htf.transform(tokenized)
tf.toPandas().head()

Unnamed: 0,doc_id,doc_text,words,tf
0,978796580789719042,RT @BarrettSallee: Hero https://t.co/4eyjbjtqKp,"[rt, barrettsallee, hero, https, t, co, 4eyjbj...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,978796298328559616,"Of the 6 new announced #Russianmissiles, actua...","[of, the, 6, new, announced, russianmissiles, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,978797301639475200,RT @Ohsehunbar: 2018 SEHUN’s bday support\n[9t...,"[rt, ohsehunbar, 2018, sehun, s, bday, support...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,978796516310577153,Pekan Suci Paroki Banteng https://t.co/JbSatyISCW,"[pekan, suci, paroki, banteng, https, t, co, j...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,978797071414145030,RT @igot_markedby7: [TRANS] #Jackson replied:\...,"[rt, igot_markedby7, trans, jackson, replied, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### IDF

In [17]:
from pyspark.ml.feature import IDF

idf = IDF(inputCol="tf",outputCol="idf")
tfidf = idf.fit(tf).transform(tf)
tfidf.toPandas().head()

Unnamed: 0,doc_id,doc_text,words,tf,idf
0,978796580789719042,RT @BarrettSallee: Hero https://t.co/4eyjbjtqKp,"[rt, barrettsallee, hero, https, t, co, 4eyjbj...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,978796298328559616,"Of the 6 new announced #Russianmissiles, actua...","[of, the, 6, new, announced, russianmissiles, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,978797301639475200,RT @Ohsehunbar: 2018 SEHUN’s bday support\n[9t...,"[rt, ohsehunbar, 2018, sehun, s, bday, support...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,978796516310577153,Pekan Suci Paroki Banteng https://t.co/JbSatyISCW,"[pekan, suci, paroki, banteng, https, t, co, j...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,978797071414145030,RT @igot_markedby7: [TRANS] #Jackson replied:\...,"[rt, igot_markedby7, trans, jackson, replied, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
