In [None]:
!pip install pandarallel psycopg2

import os
import pandas as pd
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm.session import sessionmaker

from IPython.display import display, HTML

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Connection settings are read from the environment; the host name is fixed to 'db'.
db_name = os.getenv('DB_NAME')
db_user = os.getenv('POSTGRES_USER')
db_password = os.getenv('POSTGRES_PASSWORD')
db_host = 'db'

# Export the password as PGPASSWORD for this kernel's process environment.
# Set it through os.environ instead of `%env`, because `%env VAR=value` echoes
# the value into the cell output and would store the secret in the notebook.
if db_password is not None:
    os.environ['PGPASSWORD'] = db_password

# Use the "postgresql://" scheme: the legacy "postgres://" alias is rejected by
# SQLAlchemy 1.4 and later, while "postgresql://" works on every version.
Session = sessionmaker()
db = create_engine(f"postgresql://{db_user}:{db_password}@{db_host}/{db_name}")
Session.configure(bind=db)
session = Session()

In [None]:
# Load the three tables in full. Each frame is indexed by its 'id' column, and
# drop=False keeps 'id' available as a regular column as well.
users, projects, chapters = (
    pd.read_sql(f"select * from {table_name}", session.bind).set_index('id', drop=False)
    for table_name in ("users", "projects", "chapters")
)

In [None]:
# Show the column labels of the projects frame.
projects.columns

In [None]:
# Keep only the projects whose 'funded_on' value is present.
has_funding_date = projects["funded_on"].notna()
funded_projects = projects.loc[has_funding_date]

In [None]:
# Number of rows in funded_projects.
len(funded_projects)

In [None]:
# A cell renders only its final expression, so a bare `funded_projects.columns`
# followed by another expression is silently discarded. Display both explicitly.
display(funded_projects.columns, funded_projects.index)

In [None]:
# Count funded projects per chapter, attach the chapter rows (matched on the
# chapters index), and list the chapters with the most funded projects first.
# The count is kept as a flat, named column: aggregating with agg(["count"])
# yields two-level column labels, and joining those to the single-level
# chapters frame is a mixed-level merge that recent pandas versions reject.
(
    funded_projects.groupby('chapter_id')['id']
    .count()
    .rename('funded_count')
    .to_frame()
    .join(chapters)
    .sort_values(by='funded_count', ascending=False)
)

In [None]:
# Preview projects with its 'id' column renamed to 'project_id'.
# rename() returns a new frame; the result is only displayed, not assigned.
projects.rename(columns={'id':'project_id'})

In [None]:
# Attach chapter columns to every project via 'chapter_id' (default inner join),
# disambiguating clashing column names with '_project' / '_chapter' suffixes,
# then keep only the rows whose 'locale' is 'en'.
en_projects = (
    projects.rename(columns={'id': 'project_id'})
    .merge(
        chapters.rename(columns={'id': 'chapter_id'}),
        on="chapter_id",
        suffixes=('_project', '_chapter'),
    )
    .query("locale == 'en'")
)
en_projects

In [None]:
# Show the column labels of the projects frame.
projects.columns

In [None]:
# Show the column labels of the chapters frame.
chapters.columns

In [None]:
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vocabulary and IDF weights exactly once, on every English project.
# Calling fit_transform() again for each subset would replace the fitted
# vocabulary each time, leaving the three matrices in different feature spaces
# and the vectorizer fitted on the last subset only.
display(HTML('<h2>All projects...</h2>'))
all_projects = vectorizer.fit_transform(en_projects['about_project'].values)

is_funded = en_projects['funded_on'].notnull()

# NOTE: this rebinds `funded_projects`, which an earlier cell defines as a
# DataFrame; from here on the name refers to a TF-IDF matrix.
display(HTML('<h2>Funded projects...</h2>'))
funded_projects = vectorizer.transform(en_projects.loc[is_funded, 'about_project'].values)

display(HTML('<h2>Unfunded projects...</h2>'))
unfunded_projects = vectorizer.transform(en_projects.loc[~is_funded, 'about_project'].values)

In [None]:
# Resolve the fitted vocabulary once. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() when the installed
# version provides it and fall back to the old method otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Score one synthetic document that contains, once each, every vocabulary
# token whose first character is alphabetic.
response = vectorizer.transform([" ".join(f for f in feature_names if str(f[0]).isalpha())])

In [None]:
# Collect (token, weight) pairs for the first 1000 non-zero columns of the
# response row, keeping only strictly positive weights, then show them ordered
# from the lowest weight to the highest.
token_weights = []
for col in response.nonzero()[1][0:1000]:
    weight = response[0, col]
    if weight > 0:
        token_weights.append((feature_names[col], weight))

df = pd.DataFrame(token_weights, columns=["token", "tfidf"])
df.sort_values(by=["tfidf"], ascending=True)

In [None]:
# Show the 'about_project' text of the first project that has a 'funded_on' value.
projects.loc[projects['funded_on'].notnull(), 'about_project'].values[0]