In [None]:
# Show the CareerVillage logo banner stored in the attached image dataset;
# as the last expression of the cell, the Image object is rendered inline.
from IPython.display import Image
Image("../input/careervillage-image/CareerVillage_logo_wide_FullColor_785x100.png")

CareerVillage.org is an online platform that allows students to ask questions and ask for career advice. The site recognizes the topic of every question, and it matches them with the right members of a pool of over 16,000 professionals, who volunteer to help kids by answering their questions as thoroughly as possible.

The questions are varied, and range from very specific inquiries about a particular field to more general life-advice subjects, such as whether it is better to go straight to college after school or to take a sabbatical year first. No matter what is asked, one or more mentors will try to help by providing an answer as candid as possible, helping students get a better picture before they make up their minds.

In [None]:
# Core data handling and standard-library helpers
import numpy as np
import os
import pandas as pd
import sys
# Static plotting; the magic below renders matplotlib figures inline in the notebook
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
from wordcloud import WordCloud,STOPWORDS
# Silence every warning for the rest of the session (this also hides deprecation notices)
import warnings
warnings.filterwarnings('ignore')
# plotly
# NOTE(review): plotly 4+ moved `plotly.plotly` into the separate `chart_studio`
# package, so the next import may fail on newer installs — confirm the plotly
# version in use. `py` is not referenced by the cells visible in this notebook.
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [None]:
# Single source of truth for the dataset location, so moving the data
# (e.g. running outside Kaggle) means editing one line instead of thirteen.
DATA_DIR = '../input/data-science-for-good-careervillage'

# Load every CSV of the CareerVillage dataset into its own DataFrame.
answers = pd.read_csv(os.path.join(DATA_DIR, 'answers.csv'))
comments = pd.read_csv(os.path.join(DATA_DIR, 'comments.csv'))
emails = pd.read_csv(os.path.join(DATA_DIR, 'emails.csv'))
group_memberships = pd.read_csv(os.path.join(DATA_DIR, 'group_memberships.csv'))
groups = pd.read_csv(os.path.join(DATA_DIR, 'groups.csv'))
matches = pd.read_csv(os.path.join(DATA_DIR, 'matches.csv'))
professionals = pd.read_csv(os.path.join(DATA_DIR, 'professionals.csv'))
questions = pd.read_csv(os.path.join(DATA_DIR, 'questions.csv'))
school_memberships = pd.read_csv(os.path.join(DATA_DIR, 'school_memberships.csv'))
students = pd.read_csv(os.path.join(DATA_DIR, 'students.csv'))
tag_questions = pd.read_csv(os.path.join(DATA_DIR, 'tag_questions.csv'))
tag_users = pd.read_csv(os.path.join(DATA_DIR, 'tag_users.csv'))
tags = pd.read_csv(os.path.join(DATA_DIR, 'tags.csv'))

1.   [Answers](#answers)
2.   [Comments](#comments)
3.   [Emails](#emails)
4.   [Group Memberships](#group-memberships)
5.   [Groups](#groups)
6.   [Matches](#matches)
7.   [Professionals](#professionals)
8.   [Questions](#questions)
9.   [School Memberships](#school-memberships)
10. [Students](#students)
11. [Tag Questions](#tag-questions)
12. [Tag Users](#tag-users)
13. [Tags](#tags)

# 1. <a id="answers"> Answers </a>

In [None]:
# Preview the first rows of the answers table.
answers.head()

In [None]:
# Summary statistics of the answers table (DataFrame.describe() defaults).
answers.describe()

In [None]:
# Word cloud built from every non-missing answer body.
words = answers['answers_body'].dropna()
wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(words))
plt.figure(figsize=(16,12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Answers")
plt.axis("off")
# End with show() so the cell does not echo the axis-limits tuple returned by plt.axis().
plt.show()

# 2. <a id="comments"> Comments </a>

In [None]:
# Preview the first rows of the comments table.
comments.head()

In [None]:
# Summary statistics of the comments table (DataFrame.describe() defaults).
comments.describe()

In [None]:
# Word cloud built from every non-missing comment body.
words = comments['comments_body'].dropna()
wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(words))
plt.figure(figsize=(16,12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Comments")
plt.axis("off")
# End with show() so the cell does not echo the axis-limits tuple returned by plt.axis().
plt.show()

# 3. <a id="emails"> Emails </a>

In [None]:
# Preview the first rows of the emails table.
emails.head()

In [None]:
# Summary statistics of the emails table (DataFrame.describe() defaults).
emails.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the emails table.
emails.info()

In [None]:
# Number of rows in `emails` per notification frequency level.
# value_counts() yields the actual counts the y-axis label promises; feeding the
# raw column and its row index to barplot would plot the mean row index per level.
frequency_counts = emails['emails_frequency_level'].value_counts()
plt.figure(figsize=(10,6))
# Keyword x/y: seaborn 0.12+ no longer accepts the data vectors positionally.
sns.barplot(x=frequency_counts.index, y=frequency_counts.values)
plt.xlabel("emails_frequency_level", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.show()

# 4. <a id="group-memberships"> Group Memberships </a>

In [None]:
# Preview the first rows of the group memberships table.
group_memberships.head()

In [None]:
# Summary statistics of the group memberships table (DataFrame.describe() defaults).
group_memberships.describe()

# 5. <a id="groups"> Groups </a>

In [None]:
# Preview the first rows of the groups table.
groups.head()

In [None]:
# Summary statistics of the groups table (DataFrame.describe() defaults).
groups.describe()

In [None]:
# Number of groups per group type; value_counts() returns them most frequent first.
sorted_groups = groups['groups_group_type'].value_counts()
plt.figure(figsize=(12,8))
# Keyword x/y: seaborn 0.12+ no longer accepts the data vectors positionally.
sns.barplot(x=sorted_groups.values, y=sorted_groups.index)
plt.xlabel("Count", fontsize=15)
plt.ylabel("Group Type", fontsize=15)
plt.show()

# 6. <a id="matches"> Matches </a>

In [None]:
# Preview the first rows of the matches table.
matches.head()

In [None]:
# Summary statistics of the matches table (DataFrame.describe() defaults).
matches.describe()

# 7. <a id="professionals"> Professionals </a>

In [None]:
# Preview the first rows of the professionals table.
professionals.head()

In [None]:
# Summary statistics of the professionals table (DataFrame.describe() defaults).
professionals.describe()

In [None]:
# Count of missing values in each column of the professionals table.
professionals.isnull().sum()

## **Top 10 locations of professionals in the network**

In [None]:
# Ten most frequent professional locations (missing locations are excluded by value_counts).
locations = professionals['professionals_location'].value_counts().head(10)
plt.figure(figsize=(12,8))
# Keyword x/y: seaborn 0.12+ no longer accepts the data vectors positionally.
sns.barplot(x=locations.values, y=locations.index)
plt.xlabel("Count", fontsize=15)
plt.ylabel("Location", fontsize=15)
plt.show()

The most common location among professionals is *New York*.

## **Top 10 industries professionals belong to**

In [None]:
# Ten most frequent industries among professionals (missing values are excluded by value_counts).
industries = professionals['professionals_industry'].value_counts().head(10)
plt.figure(figsize=(12,8))
# Keyword x/y: seaborn 0.12+ no longer accepts the data vectors positionally.
sns.barplot(x=industries.values, y=industries.index)
plt.xlabel("Count", fontsize=15)
plt.ylabel("Industry", fontsize=15)
plt.show()

The most common industry among professionals is *Telecommunications*.

## **Headlines**

In [None]:
# Ten most frequent profile headlines among professionals (exact-string matches).
headlines = professionals['professionals_headline'].value_counts().head(10)
plt.figure(figsize=(12,8))
# Keyword x/y: seaborn 0.12+ no longer accepts the data vectors positionally.
sns.barplot(x=headlines.values, y=headlines.index)
plt.xlabel("Count", fontsize=15)
plt.ylabel("Headlines", fontsize=15)
plt.show()

In [None]:
# Word cloud built from every non-missing professional headline.
words = professionals['professionals_headline'].dropna()
wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(words))
plt.figure(figsize=(16,12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Headlines")
plt.axis("off")
# End with show() so the cell does not echo the axis-limits tuple returned by plt.axis().
plt.show()

# 8. <a id="questions"> Questions </a>

In [None]:
# Preview the first rows of the questions table.
questions.head()

In [None]:
# Summary statistics of the questions table (DataFrame.describe() defaults).
questions.describe()

In [None]:
# Word cloud built from every non-missing question title.
words = questions['questions_title'].dropna()
wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(words))
plt.figure(figsize=(16,12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Questions")
plt.axis("off")
# End with show() so the cell does not echo the axis-limits tuple returned by plt.axis().
plt.show()

In [None]:
# Word cloud built from every non-missing question body.
words = questions['questions_body'].dropna()
wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(words))
plt.figure(figsize=(16,12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Questions body")
plt.axis("off")
# End with show() so the cell does not echo the axis-limits tuple returned by plt.axis().
plt.show()

# 9. <a id="school-memberships"> School Memberships </a>

In [None]:
# Preview the first rows of the school memberships table.
school_memberships.head()

In [None]:
# Summary statistics of the school memberships table (DataFrame.describe() defaults).
school_memberships.describe()

# 10. <a id="students"> Students </a>

In [None]:
# Preview the first rows of the students table.
students.head()

In [None]:
# Summary statistics of the students table (DataFrame.describe() defaults).
students.describe()

In [None]:
# Count of missing values in each column of the students table.
students.isnull().sum()

### **Top 10 locations of students**

In [None]:
# Ten most frequent student locations (missing locations are excluded by value_counts).
locations = students['students_location'].value_counts().head(10)
plt.figure(figsize=(12,8))
# Keyword x/y: seaborn 0.12+ no longer accepts the data vectors positionally.
sns.barplot(x=locations.values, y=locations.index)
plt.xlabel("Count", fontsize=15)
plt.ylabel("Location", fontsize=15)
plt.show()

# 11. <a id="tag-questions"> Tag Questions </a>

In [None]:
# Preview the first rows of the tag-to-question table.
tag_questions.head()

In [None]:
# Summary statistics of the tag-to-question table (DataFrame.describe() defaults).
tag_questions.describe()

# 12. <a id="tag-users"> Tag Users </a>

In [None]:
# Preview the first rows of the tag-to-user table.
tag_users.head()

In [None]:
# Summary statistics of the tag-to-user table (DataFrame.describe() defaults).
tag_users.describe()

# 13. <a id="tags"> Tags </a>

In [None]:
# Preview the first rows of the tags table.
tags.head()

In [None]:
# Summary statistics of the tags table (DataFrame.describe() defaults).
tags.describe()

### **Top 10 tags**

In [None]:
# Ten most frequent tag names within the tags table itself.
# NOTE(review): this counts how often each name occurs in `tags`; if names are
# unique there, every bar is 1 and real popularity would have to come from
# tag_questions / tag_users — confirm against the data.
tags_name = tags['tags_tag_name'].value_counts().head(10)
plt.figure(figsize=(12,8))
# Keyword x/y: seaborn 0.12+ no longer accepts the data vectors positionally.
sns.barplot(x=tags_name.values, y=tags_name.index)
plt.xlabel("Count", fontsize=15)
plt.ylabel("Tags", fontsize=15)
plt.show()

**If you like this kernel, you can upvote, and if you have any suggestions please write in the comments.** *Thank you!!*

# **More to come... Stay Tuned :)**