In [1]:
import json
import pandas as pd

In [2]:
# Directory that holds the input JSON file read by this notebook
FOLDER = "./data/"

In [3]:
# Load the raw contributions. The encoding is pinned to UTF-8 so that
# non-ASCII text in the file is decoded the same way on every platform,
# instead of depending on the locale's default encoding.
with open(FOLDER+"ugcContributions.json", encoding="utf-8") as f:
    contribData = json.load(f)
contribData

{'itMakesMeFeel_emotions': {'HoNIcWZY': [{'id': '6364f4e9ad16cc2ce70b6f0f',
    'userid': 'HoNIcWZY',
    'origin': '35330',
    'source_id': 'fake90e6d701748f08514b01',
    'source': 'fake',
    'pname': 'itMakesMeFeel.emotions',
    'pvalue': '',
    'context': 'application',
    'datapoints': 0,
    'category': 'interest'},
   {'id': '6364f4e9ad16cc2ce70b6f0f',
    'userid': 'HoNIcWZY',
    'origin': '35229',
    'source_id': 'fake90e6d701748f08514b01',
    'source': 'fake',
    'pname': 'itMakesMeFeel.emotions',
    'pvalue': '',
    'context': 'application',
    'datapoints': 0,
    'category': 'interest'},
   {'id': '6364f4e9ad16cc2ce70b6f0f',
    'userid': 'HoNIcWZY',
    'origin': '35380',
    'source_id': 'fake90e6d701748f08514b01',
    'source': 'fake',
    'pname': 'itMakesMeFeel.emotions',
    'pvalue': '',
    'context': 'application',
    'datapoints': 0,
    'category': 'interest'}],
  'e4aM9WL7': [{'id': '622764d6efc0335e1948a813',
    'userid': 'e4aM9WL7',
    'origin'

In [4]:
# Keys of contribData have the form "<prefix><suffix>", e.g. "itMakesMeFeel_emotions"
suffix = ["_emotions", "_sentiment", "_text"]
prefix = ["itMakesMeFeel", "itRemindsMeOf", "itMakesMeThinkAbout"]

# Join dataframes


In [5]:
def createDataframe(dfname):
    """Build a dataframe from the records stored in contribData under dfname.

    contribData[dfname] maps each user to a list of records. The lists of
    all users are concatenated, giving one dataframe row per record.
    Empty-string pvalues ('') are replaced by an empty dict.
    """
    def emptyToDict(value):
        # '' marks "no value"; every other pvalue is kept as it is
        if value == '':
            return {}
        return value

    records = [record
               for user in contribData[dfname]
               for record in contribData[dfname][user]]
    frame = pd.DataFrame(records)
    frame['pvalue'] = frame['pvalue'].apply(emptyToDict)
    return frame


In [6]:
# One dataframe per "<prefix><suffix>" key, built with createDataframe
dataframes = {
    p+s: createDataframe(p+s)
    for p in prefix
    for s in suffix
}


In [7]:
def joinDict(d1,d2):
    """
    Joins two dictionaries into a new one whose values are always lists.

    - Key present in both: d2's value is appended to d1's value(s).
    - Key present in only one: its value is kept, wrapped in a list unless
      it already is a list.

    Neither input is modified: list values are copied before being extended.
    Keys are emitted in a deterministic order (d1's keys first, then the keys
    that appear only in d2). An argument that is not a dict (e.g. None, or the
    NaN found in unmatched rows of an outer merge) is treated as an empty dict.
    """
    def asList(value):
        # Copy existing lists so the caller's data is never aliased or mutated
        return list(value) if type(value)==list else [value]

    d1 = d1 if isinstance(d1, dict) else {}
    d2 = d2 if isinstance(d2, dict) else {}

    joined = {}
    for key in d1:
        joined[key] = asList(d1[key])
        if key in d2:
            # d2's value is appended as one element, even if it is itself a list
            joined[key].append(d2[key])
    for key in d2:
        if key not in d1:
            joined[key] = asList(d2[key])
    return joined

In [8]:
def joinDataframes( aDfList , joinFunction = joinDict):
    """
    Joins a list of dataframes on userid and origin (outer join).

    Parameters:
        aDfList: non-empty list of dataframes. Each one must provide the
            columns "id", "userid", "origin", "pname" and "pvalue".
        joinFunction: callable(accumulated, pvalue) used to merge the pvalues
            of successive joins. The running result is stored in "agg_pvalue".

    Returns:
        With two or more dataframes: the merged dataframe. The first two
        sources contribute id_0/pname_0/pvalue_0 and id_1/pname_1/pvalue_1;
        each later source i contributes pname_i/pvalue_i. The merged pvalues
        are in "agg_pvalue".
        With a single dataframe: that dataframe unchanged (no "agg_pvalue").

    Raises:
        AssertionError: if aDfList is empty.

    Rows present in only some of the dataframes get missing values (None/NaN)
    from the outer merge. Those are handed to joinFunction as {} so that the
    join functions never receive a float where they expect a pvalue.

    NOTE: from the third dataframe on only pname/pvalue are renamed; "id" is
    left to the merge's overlap suffixing, so the third dataframe's id column
    is named plain "id".
    """
    assert(len(aDfList)>0)
    if len(aDfList) == 1:
        return aDfList[0]

    def orEmpty(value):
        # NaN is the only float that is not equal to itself
        isMissing = value is None or (isinstance(value, float) and value != value)
        return {} if isMissing else value

    columns = ["id","userid","origin","pname","pvalue"]
    keys = ["userid","origin"]

    # First pass: merge the first two dataframes and create "agg_pvalue"
    join = pd.merge(aDfList[0][columns], aDfList[1][columns], on=keys, suffixes=('_0','_1'), how="outer")
    join["agg_pvalue"] = join.apply(
        lambda row: joinFunction(orEmpty(row['pvalue_0']), orEmpty(row['pvalue_1'])), axis=1)

    # Successive dataframes are merged into "agg_pvalue"
    for i in range(2, len(aDfList)):
        join = pd.merge(join, aDfList[i][columns], on=keys, suffixes=(f'_{i-1}',f'_{i}'), how="outer")
        join = join.rename(columns={"pname": f'pname_{i}', "pvalue": f'pvalue_{i}'})
        join["agg_pvalue"] = join.apply(
            lambda row: joinFunction(orEmpty(row["agg_pvalue"]), orEmpty(row[f'pvalue_{i}'])), axis=1)
    return join

In [9]:
def avgForKeys(aDict):
    """Transforms all the list values in a dictionary to a single value that represents
    the average value of the list.

    - None values and empty lists have no average: their keys are omitted.
    - A one-element list yields that element itself, unchanged.
    - A longer list yields the arithmetic mean of its elements.
    - A value that is not a list/tuple is passed through unchanged, so
      applying the function to an already averaged dictionary is harmless.

    Returns a new dictionary; aDict is not modified.
    """
    newDict = {}
    for key, value in aDict.items():
        if value is None:
            continue
        if not isinstance(value, (list, tuple)):
            # Already a single value: nothing to average
            newDict[key] = value
            continue
        if len(value) == 0:
            continue
        newDict[key] = value[0] if len(value) == 1 else sum(value)/len(value)
    return newDict


## Emotions


In [10]:
# Build the aggregated emotions dataframe: take the "<prefix>_emotions"
# frame of every prefix and merge them with joinDataframes
emotionKeys = [f"{p}_emotions" for p in prefix]
dfToJoin = [dataframes[key] for key in emotionKeys]
dfEmotions = joinDataframes(dfToJoin)


In [11]:
# Replace every per-key list in agg_pvalue by its average, then show how
# often each resulting dictionary occurs
emotionAverages = dfEmotions["agg_pvalue"].map(avgForKeys)
dfEmotions["agg_pvalue"] = emotionAverages
dfEmotions["agg_pvalue"].value_counts()

{}                                                                                        90
{'Joy': 1.0}                                                                               5
{'Fear': 1.0}                                                                              3
{'Sadness': 1.0}                                                                           3
{'Serenity': 1.0}                                                                          3
                                                                                          ..
{'Sadness': 1.0, 'Disapproval': 1.0, 'Anger': 1.0, 'Anticipation': 1.8508643500000002}     1
{'Sadness': 1.5602488}                                                                     1
{'Sadness': 1.0, 'Trust': 0.76917523, 'Interest': 1.0, 'Joy': 0.73672265}                  1
{'Joy': 1.0, 'Serenity': 1.0}                                                              1
{'Disapproval': 0.60185623, 'Love': 2.0}                              

In [12]:
# Keep only the rows whose aggregated emotions are neither empty ({})
# nor exactly {'False': 1.0}, then count them
cleanContrib = dfEmotions[(dfEmotions["agg_pvalue"]!={}) & (dfEmotions["agg_pvalue"]!={'False': 1.0}) ]
len(cleanContrib)

85

In [13]:
# Number of distinct userid values among the filtered rows
len(cleanContrib["userid"].unique())

48

In [14]:
# Number of distinct origin values among the filtered rows
len(cleanContrib["origin"].unique())

41

In [15]:
# Uncomment the next line to display every row instead of a truncated view
#pd.set_option('display.max_rows', None)
dfEmotions

Unnamed: 0,id_0,userid,origin,pname_0,pvalue_0,id_1,pname_1,pvalue_1,agg_pvalue,id,pname_2,pvalue_2
0,6364f4e9ad16cc2ce70b6f0f,HoNIcWZY,35330,itMakesMeFeel.emotions,{},6364f4e9ad16cc2ce70b6f0f,itRemindsMeOf.emotions,{},{'Anticipation': 1.0},6364f4e9ad16cc2ce70b6f0f,itMakesMeThinkAbout.emotions,{'Anticipation': 1.0}
1,6364f4e9ad16cc2ce70b6f0f,HoNIcWZY,35229,itMakesMeFeel.emotions,{},6364f4e9ad16cc2ce70b6f0f,itRemindsMeOf.emotions,{},{},6364f4e9ad16cc2ce70b6f0f,itMakesMeThinkAbout.emotions,{}
2,6364f4e9ad16cc2ce70b6f0f,HoNIcWZY,35380,itMakesMeFeel.emotions,{},6364f4e9ad16cc2ce70b6f0f,itRemindsMeOf.emotions,{},{},6364f4e9ad16cc2ce70b6f0f,itMakesMeThinkAbout.emotions,{}
3,622764d6efc0335e1948a813,e4aM9WL7,35229,itMakesMeFeel.emotions,{},622764d6efc0335e1948a813,itRemindsMeOf.emotions,{'Joy': 1.6366995},{'Joy': 1.6366995},622764d6efc0335e1948a813,itMakesMeThinkAbout.emotions,{}
4,622764d6efc0335e1948a813,e4aM9WL7,39175,itMakesMeFeel.emotions,{},622764d6efc0335e1948a813,itRemindsMeOf.emotions,{},{},622764d6efc0335e1948a813,itMakesMeThinkAbout.emotions,{}
...,...,...,...,...,...,...,...,...,...,...,...,...
170,633856a77043f23cb5236441,onDqgFnI,35362,itMakesMeFeel.emotions,{},633856a77043f23cb5236441,itRemindsMeOf.emotions,{},{},633856a77043f23cb5236441,itMakesMeThinkAbout.emotions,{}
171,633856a77043f23cb5236441,onDqgFnI,39416,itMakesMeFeel.emotions,{},633856a77043f23cb5236441,itRemindsMeOf.emotions,{},{},633856a77043f23cb5236441,itMakesMeThinkAbout.emotions,{}
172,633856a77043f23cb5236441,onDqgFnI,35229,itMakesMeFeel.emotions,{},633856a77043f23cb5236441,itRemindsMeOf.emotions,{},{},633856a77043f23cb5236441,itMakesMeThinkAbout.emotions,{}
173,6364f959ec3ec867f01140fb,oTT3YRMN,35364,itMakesMeFeel.emotions,{},6364f959ec3ec867f01140fb,itRemindsMeOf.emotions,{},{'Joy': 0.6227641},6364f959ec3ec867f01140fb,itMakesMeThinkAbout.emotions,{'Joy': 0.6227641}


## Sentiment


In [16]:
# Build the aggregated sentiment dataframe: take the "<prefix>_sentiment"
# frame of every prefix and merge them with joinDataframes
sentimentKeys = [f"{p}_sentiment" for p in prefix]
dfToJoin = [dataframes[key] for key in sentimentKeys]
dfSentiment = joinDataframes(dfToJoin)

In [17]:
# Replace every per-key list in agg_pvalue by its average, then show how
# often each resulting dictionary occurs
sentimentAverages = dfSentiment["agg_pvalue"].map(avgForKeys)
dfSentiment["agg_pvalue"] = sentimentAverages
dfSentiment["agg_pvalue"].value_counts()

{}                                      50
{'Positive': 0.7475769}                  8
{'Neutral': 1}                           6
{'Positive': 1.0}                        6
{'Positive': 1.0312800500000001}         4
                                        ..
{'Positive': 0.9855656}                  1
{'Positive': 1.92746556}                 1
{'Positive': 0.926056, 'Neutral': 1}     1
{'Positive': 0.91901934}                 1
{'Positive': 1.6187261}                  1
Name: agg_pvalue, Length: 99, dtype: int64

In [18]:
def joinString (s1, s2):
    """Joins two strings with ", " (ignoring empty values).

    A value counts as empty when it is "", {}, None or NaN. None/NaN can
    appear for rows that have no match in an outer merge; {} is what
    createDataframe stores for an empty pvalue.

    Returns "" when both values are empty, the non-empty value when only one
    of them is, and s1+", "+s2 otherwise.
    """
    def isEmpty(value):
        if value is None or value == "" or value == {}:
            return True
        # NaN is the only float that is not equal to itself
        return isinstance(value, float) and value != value

    if isEmpty(s1):
        return "" if isEmpty(s2) else s2
    if isEmpty(s2):
        return s1
    return s1+", "+s2


## Text

In [19]:
# Build the aggregated text dataframe: take the "<prefix>_text" frame of
# every prefix and merge them, concatenating the texts with joinString
textKeys = [f"{p}_text" for p in prefix]
dfToJoin = [dataframes[key] for key in textKeys]
dfText = joinDataframes(dfToJoin, joinString)

In [20]:
# Optional, disabled steps: show every row, and strip non-ASCII characters
# from the aggregated text
#pd.set_option('display.max_rows', None)
#dfText['agg_pvalue'] = dfText['agg_pvalue'].apply(lambda text: text.encode("ascii", "ignore").decode())
dfText

Unnamed: 0,id_0,userid,origin,pname_0,pvalue_0,id_1,pname_1,pvalue_1,agg_pvalue,id,pname_2,pvalue_2
0,6364f4e9ad16cc2ce70b6f0f,HoNIcWZY,35330,itMakesMeFeel.text,{},6364f4e9ad16cc2ce70b6f0f,itRemindsMeOf.text,{},Prospettive,6364f4e9ad16cc2ce70b6f0f,itMakesMeThinkAbout.text,Prospettive
1,6364f4e9ad16cc2ce70b6f0f,HoNIcWZY,35229,itMakesMeFeel.text,{},6364f4e9ad16cc2ce70b6f0f,itRemindsMeOf.text,{},Mondi diversi,6364f4e9ad16cc2ce70b6f0f,itMakesMeThinkAbout.text,Mondi diversi
2,6364f4e9ad16cc2ce70b6f0f,HoNIcWZY,35380,itMakesMeFeel.text,{},6364f4e9ad16cc2ce70b6f0f,itRemindsMeOf.text,{},(forse) Onirici,6364f4e9ad16cc2ce70b6f0f,itMakesMeThinkAbout.text,(forse) Onirici
3,622764d6efc0335e1948a813,e4aM9WL7,35229,itMakesMeFeel.text,{},622764d6efc0335e1948a813,itRemindsMeOf.text,al mare\n\n,al mare\n\n,622764d6efc0335e1948a813,itMakesMeThinkAbout.text,{}
4,622764d6efc0335e1948a813,e4aM9WL7,39175,itMakesMeFeel.text,{},622764d6efc0335e1948a813,itRemindsMeOf.text,malattia\n,malattia\n,622764d6efc0335e1948a813,itMakesMeThinkAbout.text,{}
...,...,...,...,...,...,...,...,...,...,...,...,...
170,633856a77043f23cb5236441,onDqgFnI,35362,itMakesMeFeel.text,{},633856a77043f23cb5236441,itRemindsMeOf.text,{},,633856a77043f23cb5236441,itMakesMeThinkAbout.text,{}
171,633856a77043f23cb5236441,onDqgFnI,39416,itMakesMeFeel.text,{},633856a77043f23cb5236441,itRemindsMeOf.text,{},,633856a77043f23cb5236441,itMakesMeThinkAbout.text,{}
172,633856a77043f23cb5236441,onDqgFnI,35229,itMakesMeFeel.text,{},633856a77043f23cb5236441,itRemindsMeOf.text,{},,633856a77043f23cb5236441,itMakesMeThinkAbout.text,{}
173,6364f959ec3ec867f01140fb,oTT3YRMN,35364,itMakesMeFeel.text,{},6364f959ec3ec867f01140fb,itRemindsMeOf.text,{},mia mamma perché è grazie a lei ed ai suoi ins...,6364f959ec3ec867f01140fb,itMakesMeThinkAbout.text,mia mamma perché è grazie a lei ed ai suoi ins...


In [21]:
# Display the aggregated text dataframe
dfText

Unnamed: 0,id_0,userid,origin,pname_0,pvalue_0,id_1,pname_1,pvalue_1,agg_pvalue,id,pname_2,pvalue_2
0,6364f4e9ad16cc2ce70b6f0f,HoNIcWZY,35330,itMakesMeFeel.text,{},6364f4e9ad16cc2ce70b6f0f,itRemindsMeOf.text,{},Prospettive,6364f4e9ad16cc2ce70b6f0f,itMakesMeThinkAbout.text,Prospettive
1,6364f4e9ad16cc2ce70b6f0f,HoNIcWZY,35229,itMakesMeFeel.text,{},6364f4e9ad16cc2ce70b6f0f,itRemindsMeOf.text,{},Mondi diversi,6364f4e9ad16cc2ce70b6f0f,itMakesMeThinkAbout.text,Mondi diversi
2,6364f4e9ad16cc2ce70b6f0f,HoNIcWZY,35380,itMakesMeFeel.text,{},6364f4e9ad16cc2ce70b6f0f,itRemindsMeOf.text,{},(forse) Onirici,6364f4e9ad16cc2ce70b6f0f,itMakesMeThinkAbout.text,(forse) Onirici
3,622764d6efc0335e1948a813,e4aM9WL7,35229,itMakesMeFeel.text,{},622764d6efc0335e1948a813,itRemindsMeOf.text,al mare\n\n,al mare\n\n,622764d6efc0335e1948a813,itMakesMeThinkAbout.text,{}
4,622764d6efc0335e1948a813,e4aM9WL7,39175,itMakesMeFeel.text,{},622764d6efc0335e1948a813,itRemindsMeOf.text,malattia\n,malattia\n,622764d6efc0335e1948a813,itMakesMeThinkAbout.text,{}
...,...,...,...,...,...,...,...,...,...,...,...,...
170,633856a77043f23cb5236441,onDqgFnI,35362,itMakesMeFeel.text,{},633856a77043f23cb5236441,itRemindsMeOf.text,{},,633856a77043f23cb5236441,itMakesMeThinkAbout.text,{}
171,633856a77043f23cb5236441,onDqgFnI,39416,itMakesMeFeel.text,{},633856a77043f23cb5236441,itRemindsMeOf.text,{},,633856a77043f23cb5236441,itMakesMeThinkAbout.text,{}
172,633856a77043f23cb5236441,onDqgFnI,35229,itMakesMeFeel.text,{},633856a77043f23cb5236441,itRemindsMeOf.text,{},,633856a77043f23cb5236441,itMakesMeThinkAbout.text,{}
173,6364f959ec3ec867f01140fb,oTT3YRMN,35364,itMakesMeFeel.text,{},6364f959ec3ec867f01140fb,itRemindsMeOf.text,{},mia mamma perché è grazie a lei ed ai suoi ins...,6364f959ec3ec867f01140fb,itMakesMeThinkAbout.text,mia mamma perché è grazie a lei ed ai suoi ins...


# Store in a JSON file


In [22]:
# Skeleton of one contribution object of the output JSON.
# The fields set to None (id, userid, origin, pname, pvalue) are filled in
# per row by createObject; the other fields are constants copied as they are.
template = {
    'id': None,
    'userid': None,
    'origin': None,
    'source_id': 'fake90e6d701748f08514b01',
    'source': 'fake',
    'pname': None,
    'pvalue': None,
    'context': 'application',
    'datapoints': 0,
    'category': 'interest'
}

In [23]:
def createObject (rowData, template, pname):
    """Return a shallow copy of template filled in from one row.

    id, userid and origin are read from rowData, pvalue is taken from
    rowData['agg_pvalue'] and pname is the given pname. The template itself
    is left untouched.
    """
    return {
        **template,
        'id': rowData['id'],
        'userid': rowData['userid'],
        'origin': rowData['origin'],
        'pname': pname,
        'pvalue': rowData['agg_pvalue'],
    }

def addObjectToList (userid, dataFrame, rowIndexList, template, pname):
    """Build one object per entry of rowIndexList from the rows of dataFrame.

    Every entry of rowIndexList must be a valid label for dataFrame.loc; the
    matching row is turned into an object by createObject. The objects are
    returned in the order of rowIndexList. userid is accepted but not used.
    """
    return [createObject(dataFrame.loc[label], template, pname)
            for label in rowIndexList]


# Uncomment this line to see the structure of a Pandas Groupby
# (a dict where keys are userids and values are list of index values)
#dfEmotions.groupby(by=["userid"]).groups


In [24]:
# Each aggregated dataframe is stored under one key of the output, with the
# pname written into its objects: (dataframe, output key, pname)
mergedSources = [
    (dfEmotions, 'itMakesMeThinkAbout_emotions', 'itMakesMeThinkAbout.emotions'),
    (dfSentiment, 'itMakesMeThinkAbout_sentiment', 'itMakesMeThinkAbout.sentiment'),
    (dfText, 'itMakesMeThinkAbout_text', 'itMakesMeThinkAbout.text'),
]

newContribData = {}
for mergedDf, contribKey, contribPname in mergedSources:
    # Group by the column name itself (not a one-element list) so the group
    # keys are the plain userid values, which are usable as JSON object keys.
    # NOTE(review): grouping by a one-element list may yield 1-tuples as keys
    # in some pandas versions - confirm against the installed version.
    userGroups = mergedDf.groupby("userid").groups
    newContribData[contribKey] = {
        userid: addObjectToList (userid, mergedDf, rowIndexList, template, contribPname)
        for userid, rowIndexList in userGroups.items()
    }

newContribData

{'itMakesMeThinkAbout_emotions': {'0bAAwK14': [{'id': '63313e7848ebba445a3b3cee',
    'userid': '0bAAwK14',
    'origin': '35450',
    'source_id': 'fake90e6d701748f08514b01',
    'source': 'fake',
    'pname': 'itMakesMeThinkAbout.emotions',
    'pvalue': {'Sadness': 2.407580425, 'Interest': 1.0, 'Joy': 0.8233193},
    'context': 'application',
    'datapoints': 0,
    'category': 'interest'},
   {'id': '63313e7848ebba445a3b3cee',
    'userid': '0bAAwK14',
    'origin': '35215',
    'source_id': 'fake90e6d701748f08514b01',
    'source': 'fake',
    'pname': 'itMakesMeThinkAbout.emotions',
    'pvalue': {'Sadness': 1.767029006666667},
    'context': 'application',
    'datapoints': 0,
    'category': 'interest'}],
  '2GROTNeg': [{'id': '63a3248b88aacc4c1b0e3069',
    'userid': '2GROTNeg',
    'origin': '35381',
    'source_id': 'fake90e6d701748f08514b01',
    'source': 'fake',
    'pname': 'itMakesMeThinkAbout.emotions',
    'pvalue': {'Interest': 1.0},
    'context': 'application',
  

In [25]:
# Write the merged contributions next to the input file. The path is built
# from FOLDER, like the input path, and the encoding is explicit.
with open(FOLDER+"ugcContributions_merged.json", "w", encoding="utf-8") as outfile:
    json.dump(newContribData, outfile, indent=2)