## Connection to the Azure SQL Database

The variables below are used to build the JDBC connection to the Azure SQL Database programmatically.

In [None]:
import os

# Connection settings for the Azure SQL Database. Every value can be overridden
# through an environment variable; the literals are only fallbacks that keep the
# notebook working unchanged when no override is present.
# SECURITY(review): the fallback credentials are committed in clear text. Rotate
# them and provide PRODUTIVIDADE_JDBC_USER / PRODUTIVIDADE_JDBC_PASSWORD from the
# environment (or a secret store) instead of keeping them in the notebook.
jdbcUsername = os.environ.get("PRODUTIVIDADE_JDBC_USER", "feuplogin")
jdbcPassword = os.environ.get("PRODUTIVIDADE_JDBC_PASSWORD", "Logproject33")
jdbcHostname = os.environ.get("PRODUTIVIDADE_JDBC_HOST", "intranet14.database.windows.net")
jdbcPort = int(os.environ.get("PRODUTIVIDADE_JDBC_PORT", 1433))
jdbcDatabase = os.environ.get("PRODUTIVIDADE_JDBC_DATABASE", "intranetfeupp14")

# JDBC URL for the SQL Server driver: encrypted connection, server certificate
# validated against *.database.windows.net, 30 second login timeout.
jdbcUrl = (
    "jdbc:sqlserver://{0}:{1};database={2};encrypt=true;trustServerCertificate=false;"
    "hostNameInCertificate=*.database.windows.net;loginTimeout=30;"
).format(jdbcHostname, jdbcPort, jdbcDatabase)

# Properties passed to every spark.read.jdbc / DataFrame.write.jdbc call below.
connectionProperties = {
    "user": jdbcUsername,
    "password": jdbcPassword,
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
}

## Read and Transform Data

### Calendar Dimension

In [None]:
import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, sequence

# Populates [dwProdutividade].[DIM_CALENDARIO] with one row per month between
# the earliest [DataInicio] of the imputations and the latest [DataFim] of the
# budgets, plus a placeholder row (ID 0) for tasks without a designated time.

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

calendar_columns = ["ID_Calendario", "Mes", "Mes_Abrev", "Mes_Extenso", "Ano", "Ano_Mes_Num", "Mes_Abrev_E_Ano", "Mes_Extenso_E_Ano", "Trimestre_Num", "Trimestre"]

# Month names are hard-coded in Portuguese because the pt_PT locale is not
# supported in this environment (so strftime('%B') cannot be used).
month_names = ('Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho', 'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro')

i_date = spark.read.jdbc(url=jdbcUrl, table="(SELECT TOP(1) [DataInicio] FROM [stg].[EXT_TBL_IMPUTACAO_DETALHE] ORDER BY [DataInicio] ASC) AS query", properties=connectionProperties)

f_date = spark.read.jdbc(url=jdbcUrl, table="(SELECT TOP(1) [DataFim] FROM [stg].[EXT_TBL_ORCAMENTO] ORDER BY [DataFim] DESC) AS query", properties=connectionProperties)

# Both bounds are truncated to the first day of their month. Only the year and
# month of each generated timestamp are used, and without the truncation the
# last month would be skipped whenever the end date falls on an earlier day of
# the month than the start date.
initial_date = expr("date_trunc('month', to_timestamp('{}'))".format(i_date.collect()[0][0]))
final_date = expr("date_trunc('month', to_timestamp('{}'))".format(f_date.collect()[0][0]))

timestamps_array = sequence(initial_date, final_date, expr("interval 1 month")).alias("timestamps")

calendar_rows = []

for row in spark.range(1).select(timestamps_array).collect():
    for timestamp in row.timestamps:
        calendar_year = timestamp.year
        calendar_month = timestamp.month

        # ID is the number YYYYMM; Ano_Mes_Num is the text "YYYY-MM".
        calendar_id = int("{}{:02d}".format(calendar_year, calendar_month))
        year_month_num = "{}-{:02d}".format(calendar_year, calendar_month)

        month_extensive = month_names[calendar_month - 1]
        month_abbreviated = month_extensive[:3]
        month_abbreviated_year = month_abbreviated + " " + str(calendar_year)
        month_extensive_year = month_extensive + " " + str(calendar_year)

        # Months 1-3 -> quarter 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
        quarter = (calendar_month - 1) // 3 + 1
        quarter_extensive = str(calendar_year) + " Trimestre " + str(quarter)

        calendar_row = (calendar_id, calendar_month, month_abbreviated, month_extensive, calendar_year, year_month_num, month_abbreviated_year, month_extensive_year, quarter, quarter_extensive)
        print(" | ".join(str(value) for value in calendar_row))
        calendar_rows.append(calendar_row)

# Placeholder row for tasks without time designated. It is printed from its own
# values: the previous version printed stale loop variables and referenced
# undefined names, which aborted the cell before this row could be written.
undefined_row = (0, 0, '---', 'Não Definido', 0, '---', 'Não Definido', 'Não Definido', 0, 'Não Definido')
print(" | ".join(str(value) for value in undefined_row))
calendar_rows.append(undefined_row)

# Uploads the whole dimension in a single JDBC write instead of one per month.
df = spark.createDataFrame(calendar_rows, calendar_columns)
df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[DIM_CALENDARIO]", mode="append", properties=connectionProperties)

### Productivity Classification Dimension

In [None]:
from pyspark.sql import SparkSession

# Populates [dwProdutividade].[DIM_CLASSIFICACAO_PRODUTIVIDADE] with the six
# productivity intervals (IDs 0-5) and an "invalid" entry (ID 6).

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

classification_columns = ["ID_Classificacao_Produtividade", "Intervalo", "Nota", "Nota_Num"]

# The three lists are parallel: position k describes classification ID k.
intervals = ["[0%, 100%[", "100%", "]100%, 125%]", "]125%, 150%]", "]150%, 175%]", "]175%, +∞]"]
notes = ["Menos do Previsto", "Como Previsto", "Até 25% Horas Extra Consumidas", "Até 50% Horas Extra Consumidas", "Até 75% Horas Extra Consumidas", "Mais de 75% Horas Extra Consumidas"]
number_notes = [5, 4, 3, 2, 1, 0]

classification_rows = [
    (classification_id, interval, note, number_note)
    for classification_id, (interval, note, number_note) in enumerate(zip(intervals, notes, number_notes))
]

# Entry used when the classification cannot be computed.
classification_rows.append((6, 'Inválido', 'Inválida', -1))

# Single JDBC write for the whole dimension instead of one write per row.
df = spark.createDataFrame(classification_rows, classification_columns)
df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[DIM_CLASSIFICACAO_PRODUTIVIDADE]", mode="append", properties=connectionProperties)

for classification_row in classification_rows:
    print(" | ".join(str(value) for value in classification_row))

### State Dimension

In [None]:
from pyspark.sql import SparkSession

# Populates [dwProdutividade].[DIM_ESTADO] with the distinct project states
# found in the staging table, plus a placeholder row (ID 0).

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

states_font = spark.read.jdbc(url=jdbcUrl, table="(select distinct [Estado] from [stg].[EXT_TBL_PROJETOS]) AS query", properties=connectionProperties).collect()

# Keeps the order returned by the query and drops repeated values. NULL states
# are skipped: they are represented by the placeholder row (ID 0), and a None
# value previously aborted the cell.
states = list(dict.fromkeys(row['Estado'] for row in states_font if row['Estado'] is not None))

state_rows = [(0, 'Não Definido')]
state_rows.extend(enumerate(states, start=1))

# Single JDBC write for the whole dimension instead of one write per state.
df = spark.createDataFrame(state_rows, ["ID_Estado", "Estado"])
df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[DIM_ESTADO]", mode="append", properties=connectionProperties)

for state_id, state_name in state_rows:
    print(str(state_id) + " | " + state_name)

### Profile Dimension

In [None]:
from pyspark.sql import SparkSession

# Populates [dwProdutividade].[DIM_PERFIL] with the distinct profiles found in
# the budget staging table, plus a placeholder row (ID 0).

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

profiles_font = spark.read.jdbc(url=jdbcUrl, table="(select distinct [Perfil] from [stg].[EXT_TBL_ORCAMENTO]) AS query", properties=connectionProperties).collect()

# Drops repeated values; NULL profiles are skipped because they are covered by
# the placeholder row (ID 0) and a None value previously aborted the sort.
profiles = list(dict.fromkeys(row['Perfil'] for row in profiles_font if row['Perfil'] is not None))

# Sorts profiles by the number held in their first two characters.
sorted_profiles = sorted(profiles, key=lambda x: int(x[0:2]))

profile_columns = ["ID_Perfil", "Nome_Perfil", "Num_Perfil", "Posicao_Perfil"]
profile_rows = [(0, 'Não definido', 0, 'Não definida')]

for profile_id, profile in enumerate(sorted_profiles, start=1):
    profile_number = int(profile[0:2])
    # The position text starts after the number and its separator: at index 4
    # for single-digit numbers (0-9) and at index 5 otherwise.
    position_start = 4 if 0 <= profile_number <= 9 else 5
    profile_position = profile[position_start:]

    print(str(profile_id) + " | " + profile + " | " + str(profile_number) + " | " + profile_position)
    profile_rows.append((profile_id, profile, profile_number, profile_position))

# Single JDBC write for the whole dimension instead of one write per profile.
df = spark.createDataFrame(profile_rows, profile_columns)
df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[DIM_PERFIL]", mode="append", properties=connectionProperties)

### Employee Dimension

In [None]:
from pyspark.sql import SparkSession

# Populates [dwProdutividade].[DIM_FUNCIONARIO] with the distinct usernames
# found in the imputations staging table, plus a placeholder row (ID 0).

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

# NOTE(review): the result was observed to come back ordered by the identifier
# number (ex: utilizador.231 - ordered by 231), but DISTINCT without ORDER BY
# does not guarantee any order, so the generated IDs depend on what the server
# returns. Confirm whether a stable order is required.
employees_font = spark.read.jdbc(url=jdbcUrl, table="(select distinct [Username] FROM [stg].[EXT_TBL_IMPUTACAO_DETALHE]) AS query", properties=connectionProperties).collect()

# Drops repeated values; NULL usernames are skipped because they are covered by
# the placeholder row (ID 0) and a None value previously aborted the cell.
employees = list(dict.fromkeys(row['Username'] for row in employees_font if row['Username'] is not None))

employee_rows = [(0, 'Não Definido')]

for employee_id, employee in enumerate(employees, start=1):
    print(str(employee_id) + " | " + employee)
    employee_rows.append((employee_id, employee))

# Single JDBC write for the whole dimension instead of one write per employee.
df = spark.createDataFrame(employee_rows, ["ID_Funcionario", "Nome_Funcionario"])
df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[DIM_FUNCIONARIO]", mode="append", properties=connectionProperties)

### Task Dimension

In [None]:
from pyspark.sql import SparkSession

# Populates [dwProdutividade].[DIM_TAREFA] with one row per expected-hours
# record of the staging table, plus a placeholder row (ID 0).

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

tasks = spark.read.jdbc(url=jdbcUrl, table="(select [ID], [CodigoProjeto], [Username], [Tarefa], [NHoras] from [stg].[EXT_TBL_HORASPREVISTAS]) AS query", properties=connectionProperties).collect()

task_columns = ["ID_Tarefa", "Nome_Tarefa", "Projeto_Funcionario_Horas", "Horas_Previstas_Tarefa"]
task_rows = []

for task in tasks:
    # Column order follows the select above.
    task_id, project, employee_name, task_name, hours = task

    # Everything after the first 11 characters of the username; presumably the
    # "utilizador." prefix is being removed to keep only the number - verify
    # against the staging data.
    employee_number = employee_name[11:]

    expected_hours = int(hours)

    project_task_hours = project + " funcionário " + employee_number + " (" + str(expected_hours) + " horas)"

    print(str(task_id) + " | " + project + " | " + employee_name + " | " + task_name + " | " + str(expected_hours) + " | " + project_task_hours)

    task_rows.append((task_id, task_name, project_task_hours, expected_hours))

# One JDBC write for all the tasks instead of one write per task. An empty list
# is skipped because Spark cannot infer a schema from it.
if task_rows:
    df = spark.createDataFrame(task_rows, task_columns)
    df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[DIM_TAREFA]", mode="append", properties=connectionProperties)

# Placeholder row, written separately so that its Python int values never have
# to be merged with the types coming from the database during schema inference.
df = spark.createDataFrame([(0, 'Não Definido', 'Não Definido', 0)], task_columns)
df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[DIM_TAREFA]", mode="append", properties=connectionProperties)

### Project Dimension

In [None]:
from pyspark.sql import SparkSession

# Populates [dwProdutividade].[DIM_PROJETO] with one row per project that has
# expected hours, plus a placeholder row (ID 0) for tasks without project id.

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

# Query, formatted for readability:
#   select P.[CodigoProjeto], [Departamento], [Area], sum(H.[NHoras]) as [HorasPrevistas]
#   from [stg].[EXT_TBL_PROJETOS] as P join [stg].[EXT_TBL_HORASPREVISTAS] as H
#   on P.[CodigoProjeto] = H.[CodigoProjeto]
#   group by P.[CodigoProjeto], [Departamento],[Area]
projects = spark.read.jdbc(url=jdbcUrl, table="(select P.[CodigoProjeto], [Departamento], [Area], sum(H.[NHoras]) as [HorasPrevistas] from [stg].[EXT_TBL_PROJETOS] as P join [stg].[EXT_TBL_HORASPREVISTAS] as H on P.[CodigoProjeto] = H.[CodigoProjeto] group by P.[CodigoProjeto], [Departamento],[Area]) AS query", properties=connectionProperties).collect()

project_columns = ["ID_Projeto", "Codigo_Projeto", "Nome_Projeto", "Departamento", "Area", "Horas_Previstas_Projeto"]

# Placeholder row, written separately so that its int 0 hours never have to be
# merged with the database type of the summed hours during schema inference.
undefined_code = "NAO.DEFINIDO"
df = spark.createDataFrame([(0, undefined_code, "Projeto " + undefined_code, "Não Especificado", "Não Especificada", 0)], project_columns)
df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[DIM_PROJETO]", mode="append", properties=connectionProperties)

project_rows = []

for project_id, project in enumerate(projects, start=1):
    # Column order follows the select above; expected_hours is the sum of
    # NHoras from the join of PROJETOS with HORASPREVISTAS.
    project_code, department, area, expected_hours = project

    project_name = "Projeto " + project_code

    # Missing department / area are replaced by a fixed label.
    if department is None:
        department = "Não Especificado"
    if area is None:
        area = "Não Especificada"

    print(str(project_id) + " | " + project_code + " | " + project_name + " | " + department + " | " + area + " | " + str(expected_hours) + " horas previstas")

    project_rows.append((project_id, project_code, project_name, department, area, expected_hours))

# One JDBC write for all the projects instead of one write per project. An
# empty list is skipped because Spark cannot infer a schema from it.
if project_rows:
    df = spark.createDataFrame(project_rows, project_columns)
    df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[DIM_PROJETO]", mode="append", properties=connectionProperties)


### Task Fact

In [None]:
from pyspark.sql import SparkSession

# Populates [dwProdutividade].[FACTO_TAREFA]: one row per
# (project, user, year, month, task) group of imputations, with the keys of
# the matching dimension rows (0 when no match exists) and a productivity
# classification computed over the whole task (not per month).

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

# Results of the lookup queries, keyed by the query text. The tables that are
# queried are not written by this cell, so a repeated query can safely reuse
# the rows fetched the first time.
_query_cache = {}


def _fetch(query):
    """Return the collected rows of *query* (a "(select ...) AS query" text), memoized."""
    if query not in _query_cache:
        _query_cache[query] = spark.read.jdbc(url=jdbcUrl, table=query, properties=connectionProperties).collect()
    return _query_cache[query]


def _sql_text(value):
    """Return *value* as a T-SQL string literal.

    Single quotes are doubled so that the value cannot break the query.
    None becomes NULL, which never compares equal, so the lookup simply
    returns no rows instead of failing on string concatenation.
    """
    if value is None:
        return "NULL"
    return "'" + str(value).replace("'", "''") + "'"


def _classification_from_quotient(quotient):
    """Map realized/expected hours to a classification id.

    [0%, 100%[   - note 5 - classification id 0
    100%         - note 4 - classification id 1
    ]100%, 125%] - note 3 - classification id 2
    ]125%, 150%] - note 2 - classification id 3
    ]150%, 175%] - note 1 - classification id 4
    ]175%, +inf] - note 0 - classification id 5
    """
    if quotient < 1:
        return 0
    if quotient == 1:
        return 1
    if quotient <= 1.25:
        return 2
    if quotient <= 1.50:
        return 3
    if quotient <= 1.75:
        return 4
    return 5


# Query, formatted for readability:
#   select [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID], sum([NHoras]) as HorasRealizadas
#   from [stg].[EXT_TBL_IMPUTACAO_DETALHE]
#   group by [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID]
imputations = spark.read.jdbc(url=jdbcUrl, table="(select [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID], sum([NHoras]) as HorasRealizadas from [stg].[EXT_TBL_IMPUTACAO_DETALHE] group by [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID]) AS query", properties=connectionProperties).collect()

fact_columns = ["ID", "Horas_Realizadas", "ID_Projeto", "ID_Calendario", "ID_Tarefa", "ID_Funcionario", "ID_Perfil", "ID_Classificacao_Produtividade_Tarefa"]
fact_rows = []

task_by_id_query = "(select [ID_Tarefa], [Horas_Previstas_Tarefa], [Nome_Tarefa] from [dwProdutividade].[DIM_TAREFA] where [ID_Tarefa] = {}) AS query"

for fact_id, imputation in enumerate(imputations):
    # Column order follows the select above.
    project_code, username, year, month, source_task_id, hours_performed = imputation

    code_sql = _sql_text(project_code)
    user_sql = _sql_text(username)

    # Project
    project = _fetch("(select [ID_Projeto] from [dwProdutividade].[DIM_PROJETO] where [Codigo_Projeto] = " + code_sql + ") AS query")
    project_id = int(project[0][0]) if project else 0

    # Calendar
    calendar_id = 0
    if year and month:
        calendar = _fetch("(select [ID_Calendario] from [dwProdutividade].[DIM_CALENDARIO] where [Mes] = " + str(month) + " and [Ano] = " + str(year) + ") AS query")
        if calendar:
            calendar_id = int(calendar[0][0])

    # Task. `task` holds the DIM_TAREFA row(s) and is reused below for the
    # profile and the productivity classification.
    if not source_task_id:
        # No task id on the imputation: look for the task registered for the
        # same CodigoProjeto and Username in the expected-hours table.
        task_info = _fetch("(select [Tarefa] from [stg].[EXT_TBL_HORASPREVISTAS] where [CodigoProjeto] = " + code_sql + " and [Username] = " + user_sql + ") AS query")
        if not task_info:
            task_id = 0
            task = _fetch(task_by_id_query.format(task_id))
        else:
            task = _fetch("(select [ID_Tarefa], [Horas_Previstas_Tarefa], [Nome_Tarefa] from [dwProdutividade].[DIM_TAREFA] where [Nome_Tarefa] = " + _sql_text(task_info[0][0]) + ") AS query")
            # The name may be missing from the dimension; fall back to the
            # placeholder task instead of failing on an empty result.
            task_id = int(task[0][0]) if task else 0
    else:
        task_id = int(source_task_id)
        task = _fetch(task_by_id_query.format(task_id))

    # Employee
    employee = _fetch("(select [ID_Funcionario] from [dwProdutividade].[DIM_FUNCIONARIO] where [Nome_Funcionario] = " + user_sql + ") AS query")
    employee_id = int(employee[0][0]) if employee else 0

    # Profile: taken from the budget line linked to the expected-hours record
    # of this project / user / task name (task[0][2] is Nome_Tarefa, which is
    # [Tarefa] in the expected-hours table).
    #   select [Perfil]
    #   from [stg].[EXT_TBL_HORASPREVISTAS] as H join [stg].[EXT_TBL_ORCAMENTO] as O
    #   on H.[OrcamentoID] = O.[ID]
    #   where H.[CodigoProjeto] = ... and [Username] = ... and H.[Tarefa] = ...
    profile_id = 0
    if task:
        profile_info = _fetch("(select [Perfil] from [stg].[EXT_TBL_HORASPREVISTAS] as H join [stg].[EXT_TBL_ORCAMENTO] as O on H.[OrcamentoID] = O.[ID] where H.[CodigoProjeto] = " + code_sql + " and [Username] = " + user_sql + " and H.[Tarefa] = " + _sql_text(task[0][2]) + ") AS query")
        if profile_info:
            profile = _fetch("(select [ID_Perfil] from [dwProdutividade].[DIM_PERFIL] where [Nome_Perfil] = " + _sql_text(profile_info[0][0]) + ") AS query")
            if profile:
                profile_id = int(profile[0][0])

    # Productivity Classification - in a global perspective, not in a month
    # related one. 6 ("invalid") is used when there is no task, no expected
    # hours, or no realized hours.
    classification_id = 6
    if task:
        expected = task[0][1]
        if expected:
            # NOTE(review): when the task was resolved by name above, task_id
            # comes from DIM_TAREFA while [FK_TarefaID] in the staging rows is
            # empty, so this filter may return nothing - confirm the intent.
            realized = _fetch("(select [CodigoProjeto], [Username], [FK_TarefaID], sum([NHoras]) as HorasRealizadas from [stg].[EXT_TBL_IMPUTACAO_DETALHE] where [CodigoProjeto] = " + code_sql + " and [Username] = " + user_sql + " and [FK_TarefaID] = " + str(task_id) + " group by [CodigoProjeto], [Username], [FK_TarefaID]) AS query")
            if realized:
                quotient = realized[0][3] / expected  # realized[0][3] is the realized hours sum column
                print(quotient)
                classification_id = _classification_from_quotient(quotient)

    fact_rows.append((fact_id, hours_performed, project_id, calendar_id, task_id, employee_id, profile_id, classification_id))

    print(str(fact_id) + " | " + str(hours_performed) + " horas realizadas | " + str(project_id) + " | " + str(calendar_id) + " | " + str(task_id) + " | " + str(employee_id) + " | " + str(profile_id) + " | " + str(classification_id))

# One JDBC write for the whole fact table instead of one write per row. An
# empty list is skipped because Spark cannot infer a schema from it.
if fact_rows:
    df = spark.createDataFrame(fact_rows, fact_columns)
    df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[FACTO_TAREFA]", mode="append", properties=connectionProperties)

In [None]:
from pyspark.sql import SparkSession

# Scratch cell with ad-hoc probes used to check single lookups of the Task
# Fact cell. Only the last probe is active; the earlier ones are kept below,
# disabled, for reference.

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

# Probe 1 - realized hours of one project / user / task:
# code = 'ABR.2016.016'
# query = "(select [CodigoProjeto], [Username], [FK_TarefaID], sum([NHoras]) as HorasRealizadas from [stg].[EXT_TBL_IMPUTACAO_DETALHE] where [CodigoProjeto] = '" + code + "' and [Username] = 'utilizador.211' and [FK_TarefaID] = 0 group by [CodigoProjeto], [Username], [FK_TarefaID]) AS query"
# realized = spark.read.jdbc(url=jdbcUrl, table=query, properties=connectionProperties).collect()
# print(realized[0][0])
# print(realized[0][1])
# print(realized[0][2])
# print(realized[0][3])

# Probe 2 - calendar id of one month:
# query = "(select [ID_Calendario] from [dwProdutividade].[DIM_CALENDARIO] where [Mes] = 5 and [Ano] = 2017) AS query"
# calendar = spark.read.jdbc(url=jdbcUrl, table=query, properties=connectionProperties).collect()
# print(calendar[0][0])

# Probe 3 - employee id of one username:
# query = "(select [ID_Funcionario] from [dwProdutividade].[DIM_FUNCIONARIO] where [Nome_Funcionario] = 'utilizador.213') AS query"
# employee = spark.read.jdbc(url=jdbcUrl, table=query, properties=connectionProperties).collect()
# print(str(employee[0][0]))

# Probe 4 (active) - profile of one project / user / task, and whether it is NULL.
query = "(select [Perfil] from [stg].[EXT_TBL_HORASPREVISTAS] as H join [stg].[EXT_TBL_ORCAMENTO] as O on H.[OrcamentoID] = O.[ID] where H.[CodigoProjeto] = 'DGIE.2023.304' and [Username] = 'utilizador.355' and H.[Tarefa] = 'Tarefa - Utilizador #355') AS query"
profile_info = spark.read.jdbc(url=jdbcUrl, table=query, properties=connectionProperties).collect()
if profile_info:
    print(profile_info[0][0])
    print(profile_info[0][0] is None)
else:
    # An empty result previously raised IndexError on profile_info[0].
    print("no rows returned")
print(profile_info[0][0] == None)

### Project Fact

In [None]:
from pyspark.sql import SparkSession

# Builds the rows of [dwProdutividade].[FACTO_PROJETO]: one row per
# (project, year, month) with the total hours realized in that month, the
# latest known progress ("Avanco"), the keys of the matching dimension rows
# (0 when no match exists) and a productivity classification computed over the
# whole project. The JDBC write is still disabled, as it was before.

spark = SparkSession.builder.appName("Produtividade").getOrCreate()

# Results of the lookup queries, keyed by the query text. The tables that are
# queried are not written by this cell, so a repeated query can safely reuse
# the rows fetched the first time.
_query_cache = {}


def _fetch(query):
    """Return the collected rows of *query* (a "(select ...) AS query" text), memoized."""
    if query not in _query_cache:
        _query_cache[query] = spark.read.jdbc(url=jdbcUrl, table=query, properties=connectionProperties).collect()
    return _query_cache[query]


def _sql_text(value):
    """Return *value* as a T-SQL string literal.

    Single quotes are doubled so that the value cannot break the query.
    None becomes NULL, which never compares equal, so the lookup simply
    returns no rows instead of failing on string concatenation.
    """
    if value is None:
        return "NULL"
    return "'" + str(value).replace("'", "''") + "'"


def _classification_from_quotient(quotient):
    """Map realized/expected hours to a classification id (0-5).

    < 1 -> 0, == 1 -> 1, <= 1.25 -> 2, <= 1.50 -> 3, <= 1.75 -> 4, above -> 5.
    """
    if quotient < 1:
        return 0
    if quotient == 1:
        return 1
    if quotient <= 1.25:
        return 2
    if quotient <= 1.50:
        return 3
    if quotient <= 1.75:
        return 4
    return 5


# Total hours executed in each project in each month. Formatted for readability:
#   select [CodigoProjeto], [Ano], [Mes], sum([HorasRealizadas]) as TotalHorasRealizadas
#   from (
#       select [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID], sum([NHoras]) as HorasRealizadas
#       from [stg].[EXT_TBL_IMPUTACAO_DETALHE]
#       group by [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID]) as T
#   group by [CodigoProjeto], [Ano], [Mes]
projects_mensal = spark.read.jdbc(url=jdbcUrl, table="(select [CodigoProjeto], [Ano], [Mes], sum([HorasRealizadas]) as TotalHorasRealizadas from ( select [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID], sum([NHoras]) as HorasRealizadas from [stg].[EXT_TBL_IMPUTACAO_DETALHE] group by [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID]) as T group by [CodigoProjeto], [Ano], [Mes]) AS query", properties=connectionProperties).collect()

fact_columns = ["ID", "Avanco_Projeto", "Horas_Realizadas_Projeto", "ID_Estado", "ID_Projeto", "ID_Calendario", "ID_Classificacao_Produtividade_Projeto"]

for fact_id, project in enumerate(projects_mensal):
    # Column order follows the select above.
    project_code, year, month, hours_performed = project

    code_sql = _sql_text(project_code)

    # Advance: the most recent progress record dated from January 2010 up to
    # the end of this row's month. The previous month-by-month loop never
    # reset the month after stepping back a year, so records from earlier
    # years were never found; a single ordered query covers the whole range.
    advance = 0
    if year and month:
        advance_info = _fetch(
            "(select top(1) [Avanco] from [stg].[EXT_TBL_HISTORICO_AVANCOS] where [CodigoProjecto] = " + code_sql
            + " and [DataAvanco] >= datefromparts(2010, 1, 1)"
            + " and [DataAvanco] < dateadd(month, 1, datefromparts(" + str(int(year)) + ", " + str(int(month)) + ", 1))"
            + " order by [DataAvanco] desc) AS query"
        )
        if advance_info:
            advance = advance_info[0][0]

    # State: the project's state name is only available in
    # stg.EXT_TBL_PROJETOS; it is translated to the key of DIM_ESTADO
    # (previously the name itself was stored as the id). 0 is the undefined
    # state for exceptional cases.
    state_id = 0
    state = _fetch("(select [Estado] from [stg].[EXT_TBL_PROJETOS] where [CodigoProjeto] = " + code_sql + ") AS query")
    if state:
        state_key = _fetch("(select [ID_Estado] from [dwProdutividade].[DIM_ESTADO] where [Estado] = " + _sql_text(state[0][0]) + ") AS query")
        if state_key:
            state_id = state_key[0][0]

    # Project (the expected hours are also used by the classification below).
    project_info = _fetch("(select [ID_Projeto], [Horas_Previstas_Projeto] from [dwProdutividade].[DIM_PROJETO] where [Codigo_Projeto] = " + code_sql + ") AS query")
    project_id = project_info[0][0] if project_info else 0

    # Calendar
    calendar_id = 0
    if year and month:
        calendar = _fetch("(select [ID_Calendario] from [dwProdutividade].[DIM_CALENDARIO] where [Mes] = " + str(month) + " and [Ano] = " + str(year) + ") AS query")
        if calendar:
            calendar_id = calendar[0][0]

    # Productivity Classification: total realized hours of the project (all
    # months) divided by its expected hours. 6 ("invalid") is used when the
    # realized hours, the project or its expected hours are missing.
    #   select sum([HorasRealizadas]) as TotalHorasRealizadas
    #   from ( ...imputations grouped by project, user, year, month, task... ) as T
    #   where [CodigoProjeto] = ...
    #   group by [CodigoProjeto]
    realized = _fetch("(select sum([HorasRealizadas]) as TotalHorasRealizadas from ( select [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID], sum([NHoras]) as HorasRealizadas from [stg].[EXT_TBL_IMPUTACAO_DETALHE] group by [CodigoProjeto], [Username], [Ano], [Mes], [FK_TarefaID]) as T where [CodigoProjeto] = " + code_sql + " group by [CodigoProjeto]) AS query")

    classification_id = 6
    if realized and project_info and project_info[0][1]:
        quotient = realized[0][0] / project_info[0][1]
        print(quotient)
        classification_id = _classification_from_quotient(quotient)

    df = spark.createDataFrame([(fact_id, advance, hours_performed, state_id, project_id, calendar_id, classification_id)], fact_columns)
    # Upload still disabled:
    #df.write.jdbc(url=jdbcUrl, table="[dwProdutividade].[FACTO_PROJETO]", mode="append", properties=connectionProperties)

    print(str(fact_id) + " | " + str(advance) + "% | " + str(hours_performed) + " horas realizadas | " + str(state_id) + " | " + str(project_id) + " | " + str(calendar_id) + " | " + str(classification_id))