In [1]:
import pyspark
import random
# Reuse the active SparkContext when this cell is re-run: constructing a second
# SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
# NOTE(review): the app name "Pi" looks like a leftover from another example — confirm.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Pi"))

In [8]:
# Input files (users.txt is not needed by this program, so its path stays disabled)
# usersPath = "data/users.txt"
appsPath = "apps.txt"
actionsPath = "actions.txt"

# Output folders, one per part of the exercise
outputPath1, outputPath2 = "outPart1/", "outPart2/"

In [9]:
# Build one RDD of text lines for each input file.

# Actions line format: UserID, AppID, Timestamp, Action
# Timestamp format: YYYY/MM/DD-HH:MM:SS
# (the code below splits each line on "," and compares fields exactly)
actionsRDD = sc.textFile(actionsPath)

# Apps line format: AppId, Name, Price, Category, Company
appsRDD = sc.textFile(appsPath)

In [10]:
#########################################
# PART 1
#########################################

In [11]:
# Select the actions performed in 2021
# since we are interested only in installation and removal actions, a condition
# on the type of action is also introduced

def filter2021InstallRemove(line):
    """Return True for an "Install" or "Remove" action performed in 2021.

    Parameters
    ----------
    line : str
        One comma-separated action record: UserID,AppID,Timestamp,Action,
        where Timestamp starts with the year (YYYY/MM/DD-HH:MM:SS).

    Returns
    -------
    bool
        True when the year is 2021 and the action is "Install" or "Remove".
        Blank or malformed lines (fewer than four fields, or a non-numeric
        year) yield False instead of raising, so that a single bad record
        does not abort the whole job.
    """
    fields = line.split(",")
    # A blank line splits into [""], so indexing fields[3] would raise IndexError.
    if len(fields) < 4:
        return False

    try:
        year = int(fields[2].split("/")[0])
    except ValueError:
        # The timestamp does not start with an integer year.
        return False

    return year == 2021 and fields[3] in ("Install", "Remove")

# Keep only the action lines accepted by filter2021InstallRemove
actions2021 = actionsRDD.filter(filter2021InstallRemove)

In [14]:
# Create a PairRDD with
# key = appId, Month
# value = install/removal
# where the value is either +1 (install) or -1 (removal), depending on the type of action the line is
# associated with
#
# Then use reduceByKey to count the difference between the number of
# installations and removals of each app in a specific month of 2021


def mapAppIdMonthInstallRemoval(line):
    """Map an action line to ((appId, month), +1 or -1).

    The key pairs the app id (second field) with the month taken from the
    YYYY/MM/DD part of the timestamp (third field). The value is +1 when the
    action (fourth field) is "Install" and -1 for any other action.
    """
    parts = line.split(",")
    key = (parts[1], parts[2].split("/")[1])
    delta = 1 if parts[3] == "Install" else -1
    return (key, delta)


# Sum the +1/-1 contributions of each (appId, month) key: the total is
# #installations - #removals for that app in that month of 2021.
appMonthInstallRemoval = (
    actions2021
    .map(mapAppIdMonthInstallRemoval)
    .reduceByKey(lambda left, right: left + right)
)

In [16]:
# Keep the (appId, month) pairs whose difference is strictly positive,
# i.e. the months in which the app had #installations > #removals.
appMonthWithMoreInstall = appMonthInstallRemoval.filter(
    lambda entry: entry[1] > 0
)

In [17]:
# Re-key every surviving ((appId, month), difference) pair as (appId, +1)
# and sum per app: the total is the number of months of 2021 in which the
# app had #installations > #removals.
#
# An app is kept only when that total equals 12.
appAll2021 = (
    appMonthWithMoreInstall
    .map(lambda entry: (entry[0][0], 1))
    .reduceByKey(lambda total, one: total + one)
    .filter(lambda appCount: appCount[1] == 12)
)

In [18]:
# Starting from the content of Apps.txt, obtain a pairRDD with
# key = appId
# value = app name

def mapAppIDAppName(line):
    """Map an apps line to (appId, appName): its first two comma-separated fields."""
    columns = line.split(",")
    return (columns[0], columns[1])


# Pair RDD of (appId, appName), one element per line of the apps file
appIdName = appsRDD.map(mapAppIDAppName)

In [19]:
# Join on appId: each joined element is (appId, (monthCount, appName)).
# Only (appId, appName) is kept for the final result.
res1 = (
    appAll2021
    .join(appIdName)
    .map(lambda joined: (joined[0], joined[1][1]))
)

In [20]:
# Display the Part 1 result by collecting it to the driver.
# Storing it in the first output folder is currently disabled:
# res1.saveAsTextFile(outputPath1)
res1.collect()

[('C1app3', 'app3'), ('C1app1', 'app1')]

In [21]:
#########################################
# PART 2
#########################################

In [22]:
# Select only the "Install" actions
# and prepare a PairRDD with
# key = (appId, userId)
# value = (Timestamp>31/12/2021, Timestamp<01/01/2022)

def mapAppIDUserIDAfter21Before22(line):
    """Map an action line to ((appId, userId), (after2021, before2022)).

    The date is the part of the timestamp (third field) before the "-".
    The value is (1, 0) when that date is lexicographically greater than
    "2021/12/31", and (0, 1) otherwise.
    """
    parts = line.split(",")
    key = (parts[1], parts[0])
    day = parts[2].split("-")[0]
    # String comparison is used as-is; it orders dates correctly only for
    # zero-padded YYYY/MM/DD values.
    flags = (1, 0) if day > "2021/12/31" else (0, 1)
    return (key, flags)


# Keep the lines whose fourth field is exactly "Install", then map each one
# to ((appId, userId), (after 2021/12/31 flag, before 2022/01/01 flag)).
appUserInstall = (
    actionsRDD
    .filter(lambda record: record.split(",")[3] == "Install")
    .map(mapAppIDUserIDAfter21Before22)
)

In [23]:
# For each (appId, userId) key, add the flag pairs component-wise to count
# how many installations happened after 2021/12/31 and how many before 2022/01/01.
appUserTotInstall = appUserInstall.reduceByKey(
    lambda acc, cur: (acc[0] + cur[0], acc[1] + cur[1])
)

In [24]:
# Keep the (appId, userId) combinations with at least one installation after
# 2021/12/31 and none before 2022/01/01: those users are new users of the
# app in the key after 2021/12/31.
appsNewUsersPairRDD = appUserTotInstall.filter(
    lambda entry: entry[1][0] > 0 and entry[1][1] == 0
)

In [25]:
# Drop the counters and keep only the key part of each pair
appsNewUsers = appsNewUsersPairRDD.keys()

In [26]:
# Count, for each app, its new distinct users after 31/12/2021:
# every (appId, userId) key becomes (appId, +1) and the ones are summed per app.
# The result is cached.
newInstall2022PerApp = (
    appsNewUsers
    .map(lambda appUser: (appUser[0], 1))
    .reduceByKey(lambda a, b: a + b)
    .cache()
)

In [27]:
# Largest per-app number of new distinct users after 31/12/2021
maxInstallations = newInstall2022PerApp.values().max()

In [28]:
# Keep the ids of the apps whose count of new distinct users equals the maximum
res2 = (
    newInstall2022PerApp
    .filter(lambda appCount: appCount[1] == maxInstallations)
    .keys()
)

In [29]:
# Display the Part 2 result by collecting it to the driver.
# Storing it in the second output folder is currently disabled:
# res2.saveAsTextFile(outputPath2)
res2.collect()

['C2app2', 'C2app3', 'C2app4']